/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG","CSRMV_ALG1","CSRMV_ALG2","cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
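/* A minimal usage sketch (not from this file): how an application that manages its own
   CUDA resources might hand them to an assembled MATSEQAIJCUSPARSE matrix A. The
   variable names are illustrative and error checking on the CUDA/cuSPARSE calls is
   elided for brevity.

     cudaStream_t     stream;
     cusparseHandle_t handle;

     cudaStreamCreate(&stream);
     cusparseCreate(&handle);
     ierr = MatCUSPARSESetHandle(A,handle);CHKERRQ(ierr);
     ierr = MatCUSPARSESetStream(A,stream);CHKERRQ(ierr);
*/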
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
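/* A minimal usage sketch (assuming a KSP/PC linear solve is already set up): this
   factorization back end is normally selected from the options database, e.g.

     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically via PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE). */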
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.

  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
  MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

  Collective on mat

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
- transgen - the boolean flag

  Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present, to prevent logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
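/* A minimal usage sketch: choosing the HYB format for SpMV and requesting an explicit
   transpose on an already created MATSEQAIJCUSPARSE matrix A; the creation of A is
   assumed to happen elsewhere.

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
     ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
*/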
"MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 273 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 274 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 275 cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 276 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 277 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 278 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 279 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 280 281 cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 282 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 283 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 284 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 285 286 cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 287 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 288 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 289 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 290 #endif 291 } 292 ierr = PetscOptionsTail();CHKERRQ(ierr); 293 PetscFunctionReturn(0); 294 } 295 296 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 297 { 298 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 299 PetscErrorCode ierr; 300 301 PetscFunctionBegin; 302 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 303 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 304 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 305 PetscFunctionReturn(0); 306 } 307 308 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 309 { 310 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 311 PetscErrorCode ierr; 312 313 PetscFunctionBegin; 314 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 315 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 316 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 317 PetscFunctionReturn(0); 318 } 319 320 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 321 { 322 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 323 PetscErrorCode ierr; 324 325 PetscFunctionBegin; 326 ierr = 
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
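/* A minimal sketch of the factor/solve sequence these wrappers participate in: the
   symbolic step runs the CPU SeqAIJ code, while the numeric functions below copy the
   factors to the GPU. F, isrow, iscol, info, b, and x are assumed to be set up by the
   caller.

     ierr = MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);CHKERRQ(ierr);
     ierr = MatILUFactorSymbolic(F,A,isrow,iscol,&info);CHKERRQ(ierr);
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);
     ierr = MatSolve(F,b,x);CHKERRQ(ierr);
*/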
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt) 0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt) 0;
        AALo[0]   = (MatScalar) 1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
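/* A worked example of the lower-triangular layout built above (an assumption for
   illustration, not data from this file): for n = 3 with strictly-lower factor entries
   l10, l20, l21, the factored SeqAIJ arrays ai = {0,0,1,3}, aj = {0,0,1} yield the
   unit-diagonal CSR

     AiLo = {0, 1, 3, 6}
     AjLo = {0,   0, 1,   0, 1, 2}
     AALo = {1.0,   l10, 1.0,   l20, l21, 1.0}

   with nzLower = n + ai[n] - ai[1] = 6 (three strictly-lower entries plus three
   explicit 1's on the diagonal). */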
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset += 1;
          if (nz > 0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset += 1;
          if (nz > 0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset += nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
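/* A small illustration of what the csr2csc conversions below compute (an assumption
   for exposition, not data from this file): for

     A = [1 2]    CSR:  row_offsets = {0,2,3}, column_indices = {0,1,1}, values = {1,2,3}
         [0 3]

   the CSC of A, which is identical to the CSR of A^T, is

     col_offsets = {0,1,3}, row_indices = {0,0,1}, values = {1,2,3}

   so transposing a CSR matrix amounts to one csr2csc call. */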
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
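/* A minimal usage sketch of what triggers the generation below (an assumption for
   illustration; the MatMultTranspose implementation itself is defined elsewhere in
   this file): with transgen enabled, the first transpose product builds and caches
   the explicit transpose, and later calls reuse it.

     ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
     ierr = MatMultTranspose(A,x,y);CHKERRQ(ierr);   // builds A^T on first use
     ierr = MatMultTranspose(A,x,y);CHKERRQ(ierr);   // reuses the cached A^T
*/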
Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1195 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1196 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1197 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1198 cusparseStatus_t stat; 1199 cusparseIndexBase_t indexBase; 1200 cudaError_t err; 1201 PetscErrorCode ierr; 1202 1203 PetscFunctionBegin; 1204 if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1205 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1206 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1207 /* create cusparse matrix */ 1208 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1209 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1210 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1211 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1212 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1213 1214 /* set alpha and beta */ 1215 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1216 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1217 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1218 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1219 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1220 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1221 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1222 1223 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1224 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1225 CsrMatrix *matrixT= new CsrMatrix; 1226 matrixT->num_rows = A->cmap->n; 1227 matrixT->num_cols = A->rmap->n; 1228 matrixT->num_entries = a->nz; 1229 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1230 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1231 matrixT->values = new THRUSTARRAY(a->nz); 1232 1233 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 1234 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1235 1236 /* compute the transpose, i.e. 
the CSC */ 1237 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1238 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1239 A->cmap->n, matrix->num_entries, 1240 matrix->values->data().get(), 1241 cusparsestruct->rowoffsets_gpu->data().get(), 1242 matrix->column_indices->data().get(), 1243 matrixT->values->data().get(), 1244 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1245 CUSPARSE_ACTION_NUMERIC,indexBase, 1246 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1247 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1248 #endif 1249 1250 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1251 A->cmap->n, matrix->num_entries, 1252 matrix->values->data().get(), 1253 cusparsestruct->rowoffsets_gpu->data().get(), 1254 matrix->column_indices->data().get(), 1255 matrixT->values->data().get(), 1256 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1257 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1258 CUSPARSE_ACTION_NUMERIC,indexBase, 1259 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1260 #else 1261 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1262 CUSPARSE_ACTION_NUMERIC, indexBase 1263 #endif 1264 );CHKERRCUSPARSE(stat); 1265 matstructT->mat = matrixT; 1266 1267 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1268 stat = cusparseCreateCsr(&matstructT->matDescr, 1269 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1270 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1271 matrixT->values->data().get(), 1272 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1273 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1274 #endif 1275 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1276 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1277 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1278 #else 1279 CsrMatrix *temp = new CsrMatrix; 1280 CsrMatrix *tempT = new CsrMatrix; 1281 /* First convert HYB to CSR */ 1282 temp->num_rows = A->rmap->n; 1283 temp->num_cols = A->cmap->n; 1284 temp->num_entries = a->nz; 1285 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1286 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1287 temp->values = new THRUSTARRAY(a->nz); 1288 1289 stat = cusparse_hyb2csr(cusparsestruct->handle, 1290 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1291 temp->values->data().get(), 1292 temp->row_offsets->data().get(), 1293 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1294 1295 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1296 tempT->num_rows = A->rmap->n; 1297 tempT->num_cols = A->cmap->n; 1298 tempT->num_entries = a->nz; 1299 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1300 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1301 tempT->values = new THRUSTARRAY(a->nz); 1302 1303 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1304 temp->num_cols, temp->num_entries, 1305 temp->values->data().get(), 1306 temp->row_offsets->data().get(), 1307 temp->column_indices->data().get(), 1308 tempT->values->data().get(), 1309 tempT->column_indices->data().get(), 1310 tempT->row_offsets->data().get(), 1311 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1312 1313 /* Last, convert CSC to HYB */ 1314 cusparseHybMat_t hybMat; 1315 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1316 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1317 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1318 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1319 matstructT->descr, tempT->values->data().get(), 1320 tempT->row_offsets->data().get(), 1321 tempT->column_indices->data().get(), 1322 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1323 1324 /* assign the pointer */ 1325 matstructT->mat = hybMat; 1326 /* delete temporaries */ 1327 if (tempT) { 1328 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1329 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1330 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1331 delete (CsrMatrix*) tempT; 1332 } 1333 if (temp) { 1334 if (temp->values) delete (THRUSTARRAY*) temp->values; 1335 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1336 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1337 delete (CsrMatrix*) temp; 1338 } 1339 #endif 1340 } 1341 err = WaitForCUDA();CHKERRCUDA(err); 1342 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1343 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1344 /* the compressed row indices are not used for matTranspose */ 1345 matstructT->cprowIndices = NULL; 1346 /* assign the pointer */ 1347 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1348 PetscFunctionReturn(0); 1349 } 1350 1351 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1352 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1353 { 1354 PetscInt n = xx->map->n; 1355 const PetscScalar *barray; 1356 PetscScalar *xarray; 1357 thrust::device_ptr<const PetscScalar> bGPU; 1358 thrust::device_ptr<PetscScalar> xGPU; 1359 cusparseStatus_t stat; 1360 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1361 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1362 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1363 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1364 PetscErrorCode ierr; 1365 cudaError_t cerr; 1366 1367 PetscFunctionBegin; 1368 /* Analyze the matrix and create the transpose ...
on the fly */ 1369 if (!loTriFactorT && !upTriFactorT) { 1370 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1371 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1372 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1373 } 1374 1375 /* Get the GPU pointers */ 1376 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1377 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1378 xGPU = thrust::device_pointer_cast(xarray); 1379 bGPU = thrust::device_pointer_cast(barray); 1380 1381 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1382 /* First, reorder with the row permutation */ 1383 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1384 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1385 xGPU); 1386 1387 /* Next, solve U */ 1388 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1389 upTriFactorT->csrMat->num_rows, 1390 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1391 upTriFactorT->csrMat->num_entries, 1392 #endif 1393 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1394 upTriFactorT->csrMat->values->data().get(), 1395 upTriFactorT->csrMat->row_offsets->data().get(), 1396 upTriFactorT->csrMat->column_indices->data().get(), 1397 upTriFactorT->solveInfo, 1398 xarray, tempGPU->data().get() 1399 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1400 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1401 #endif 1402 );CHKERRCUSPARSE(stat); 1403 1404 /* Then, solve L */ 1405 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1406 loTriFactorT->csrMat->num_rows, 1407 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1408 loTriFactorT->csrMat->num_entries, 1409 #endif 1410 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1411 loTriFactorT->csrMat->values->data().get(), 1412 loTriFactorT->csrMat->row_offsets->data().get(), 1413 loTriFactorT->csrMat->column_indices->data().get(), 1414 loTriFactorT->solveInfo, 1415 tempGPU->data().get(), xarray 1416 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1417 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1418 #endif 1419 );CHKERRCUSPARSE(stat); 1420 1421 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1422 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1423 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1424 tempGPU->begin()); 1425 1426 /* Copy the temporary to the full solution.
*/ 1427 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1428 1429 /* restore */ 1430 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1431 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1432 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1433 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1434 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1435 PetscFunctionReturn(0); 1436 } 1437 1438 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1439 { 1440 const PetscScalar *barray; 1441 PetscScalar *xarray; 1442 cusparseStatus_t stat; 1443 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1444 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1445 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1446 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1447 PetscErrorCode ierr; 1448 cudaError_t cerr; 1449 1450 PetscFunctionBegin; 1451 /* Analyze the matrix and create the transpose ... on the fly */ 1452 if (!loTriFactorT && !upTriFactorT) { 1453 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1454 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1455 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1456 } 1457 1458 /* Get the GPU pointers */ 1459 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1460 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1461 1462 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1463 /* First, solve U */ 1464 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1465 upTriFactorT->csrMat->num_rows, 1466 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1467 upTriFactorT->csrMat->num_entries, 1468 #endif 1469 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1470 upTriFactorT->csrMat->values->data().get(), 1471 upTriFactorT->csrMat->row_offsets->data().get(), 1472 upTriFactorT->csrMat->column_indices->data().get(), 1473 upTriFactorT->solveInfo, 1474 barray, tempGPU->data().get() 1475 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1476 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1477 #endif 1478 );CHKERRCUSPARSE(stat); 1479 1480 /* Then, solve L */ 1481 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1482 loTriFactorT->csrMat->num_rows, 1483 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1484 loTriFactorT->csrMat->num_entries, 1485 #endif 1486 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1487 loTriFactorT->csrMat->values->data().get(), 1488 loTriFactorT->csrMat->row_offsets->data().get(), 1489 loTriFactorT->csrMat->column_indices->data().get(), 1490 loTriFactorT->solveInfo, 1491 tempGPU->data().get(), xarray 1492 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1493 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1494 #endif 1495 );CHKERRCUSPARSE(stat); 1496 1497 /* restore */ 1498 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1499 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1500 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1501 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1502 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1503 PetscFunctionReturn(0); 1504 } 1505 1506 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1507 { 1508 const PetscScalar *barray; 1509 PetscScalar 
*xarray; 1510 thrust::device_ptr<const PetscScalar> bGPU; 1511 thrust::device_ptr<PetscScalar> xGPU; 1512 cusparseStatus_t stat; 1513 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1514 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1515 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1516 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1517 PetscErrorCode ierr; 1518 cudaError_t cerr; 1519 1520 PetscFunctionBegin; 1521 1522 /* Get the GPU pointers */ 1523 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1524 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1525 xGPU = thrust::device_pointer_cast(xarray); 1526 bGPU = thrust::device_pointer_cast(barray); 1527 1528 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1529 /* First, reorder with the row permutation */ 1530 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1531 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1532 tempGPU->begin()); 1533 1534 /* Next, solve L */ 1535 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1536 loTriFactor->csrMat->num_rows, 1537 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1538 loTriFactor->csrMat->num_entries, 1539 #endif 1540 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1541 loTriFactor->csrMat->values->data().get(), 1542 loTriFactor->csrMat->row_offsets->data().get(), 1543 loTriFactor->csrMat->column_indices->data().get(), 1544 loTriFactor->solveInfo, 1545 tempGPU->data().get(), xarray 1546 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1547 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1548 #endif 1549 );CHKERRCUSPARSE(stat); 1550 1551 /* Then, solve U */ 1552 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1553 upTriFactor->csrMat->num_rows, 1554 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1555 upTriFactor->csrMat->num_entries, 1556 #endif 1557 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1558 upTriFactor->csrMat->values->data().get(), 1559 upTriFactor->csrMat->row_offsets->data().get(), 1560 upTriFactor->csrMat->column_indices->data().get(), 1561 upTriFactor->solveInfo, 1562 xarray, tempGPU->data().get() 1563 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1564 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1565 #endif 1566 );CHKERRCUSPARSE(stat); 1567 1568 /* Last, reorder with the column permutation */ 1569 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1570 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1571 xGPU); 1572 1573 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1574 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1575 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1576 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1577 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1578 PetscFunctionReturn(0); 1579 } 1580 1581 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1582 { 1583 const PetscScalar *barray; 1584 PetscScalar *xarray; 1585 cusparseStatus_t stat; 1586 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1587 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1588 
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1589 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1590 PetscErrorCode ierr; 1591 cudaError_t cerr; 1592 1593 PetscFunctionBegin; 1594 /* Get the GPU pointers */ 1595 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1596 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1597 1598 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1599 /* First, solve L */ 1600 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1601 loTriFactor->csrMat->num_rows, 1602 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1603 loTriFactor->csrMat->num_entries, 1604 #endif 1605 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1606 loTriFactor->csrMat->values->data().get(), 1607 loTriFactor->csrMat->row_offsets->data().get(), 1608 loTriFactor->csrMat->column_indices->data().get(), 1609 loTriFactor->solveInfo, 1610 barray, tempGPU->data().get() 1611 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1612 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1613 #endif 1614 );CHKERRCUSPARSE(stat); 1615 1616 /* Next, solve U */ 1617 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1618 upTriFactor->csrMat->num_rows, 1619 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1620 upTriFactor->csrMat->num_entries, 1621 #endif 1622 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1623 upTriFactor->csrMat->values->data().get(), 1624 upTriFactor->csrMat->row_offsets->data().get(), 1625 upTriFactor->csrMat->column_indices->data().get(), 1626 upTriFactor->solveInfo, 1627 tempGPU->data().get(), xarray 1628 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1629 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1630 #endif 1631 );CHKERRCUSPARSE(stat); 1632 1633 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1634 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1635 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1636 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1637 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1638 PetscFunctionReturn(0); 1639 } 1640 1641 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1642 { 1643 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1644 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1645 cudaError_t cerr; 1646 PetscErrorCode ierr; 1647 1648 PetscFunctionBegin; 1649 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1650 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1651 1652 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1653 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1654 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1655 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1656 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1657 A->offloadmask = PETSC_OFFLOAD_BOTH; 1658 } 1659 PetscFunctionReturn(0); 1660 } 1661 1662 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1663 { 1664 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1665 PetscErrorCode ierr; 1666 1667 PetscFunctionBegin; 1668 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1669 *array = a->a; 1670 A->offloadmask = PETSC_OFFLOAD_CPU; 1671 PetscFunctionReturn(0); 1672 } 1673 1674 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1675 { 1676 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1677 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1678 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 1679 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1680 PetscErrorCode ierr; 1681 cusparseStatus_t stat; 1682 cudaError_t err; 1683 1684 PetscFunctionBegin; 1685 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU"); 1686 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1687 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1688 /* Copy values only */ 1689 CsrMatrix *matrix,*matrixT; 1690 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1691 1692 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1693 matrix->values->assign(a->a, a->a+a->nz); 1694 err = WaitForCUDA();CHKERRCUDA(err); 1695 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1696 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1697 1698 /* Update matT if it has already been built */ 1699 if (cusparsestruct->matTranspose) { 1700 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1701 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1702 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1703 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1704 A->cmap->n, matrix->num_entries, 1705 matrix->values->data().get(), 1706 cusparsestruct->rowoffsets_gpu->data().get(), 1707 matrix->column_indices->data().get(), 1708 matrixT->values->data().get(), 1709 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1710 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1711 CUSPARSE_ACTION_NUMERIC,indexBase, 1712 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1713 #else 1714 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1715 CUSPARSE_ACTION_NUMERIC, indexBase 1716 #endif 1717 );CHKERRCUSPARSE(stat); 1718 err = WaitForCUDA();CHKERRCUDA(err); 1719 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1720 } 1721 } else { 1722 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1723 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1724 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1725 delete cusparsestruct->workVector; 1726 delete cusparsestruct->rowoffsets_gpu; 1727 try { 1728 if (a->compressedrow.use) { 1729 m = a->compressedrow.nrows; 1730 ii = a->compressedrow.i; 1731 ridx = a->compressedrow.rindex; 1732 } else { 1733 m = A->rmap->n; 1734 ii = a->i; 1735 ridx = NULL; 1736 } 1737 cusparsestruct->nrows = m; 1738 1739 /* create cusparse matrix */ 1740 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1741 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1742 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1743 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1744 1745 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1746 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1747 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1748 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1749 err =
cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1750 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1751 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1752 1753 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1754 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1755 /* set the matrix */ 1756 CsrMatrix *mat= new CsrMatrix; 1757 mat->num_rows = m; 1758 mat->num_cols = A->cmap->n; 1759 mat->num_entries = a->nz; 1760 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1761 mat->row_offsets->assign(ii, ii + m+1); 1762 1763 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1764 mat->column_indices->assign(a->j, a->j+a->nz); 1765 1766 mat->values = new THRUSTARRAY(a->nz); 1767 mat->values->assign(a->a, a->a+a->nz); 1768 1769 /* assign the pointer */ 1770 matstruct->mat = mat; 1771 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1772 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1773 stat = cusparseCreateCsr(&matstruct->matDescr, 1774 mat->num_rows, mat->num_cols, mat->num_entries, 1775 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1776 mat->values->data().get(), 1777 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1778 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1779 } 1780 #endif 1781 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1782 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1783 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1784 #else 1785 CsrMatrix *mat= new CsrMatrix; 1786 mat->num_rows = m; 1787 mat->num_cols = A->cmap->n; 1788 mat->num_entries = a->nz; 1789 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1790 mat->row_offsets->assign(ii, ii + m+1); 1791 1792 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1793 mat->column_indices->assign(a->j, a->j+a->nz); 1794 1795 mat->values = new THRUSTARRAY(a->nz); 1796 mat->values->assign(a->a, a->a+a->nz); 1797 1798 cusparseHybMat_t hybMat; 1799 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1800 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1801 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1802 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1803 matstruct->descr, mat->values->data().get(), 1804 mat->row_offsets->data().get(), 1805 mat->column_indices->data().get(), 1806 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1807 /* assign the pointer */ 1808 matstruct->mat = hybMat; 1809 1810 if (mat) { 1811 if (mat->values) delete (THRUSTARRAY*)mat->values; 1812 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1813 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1814 delete (CsrMatrix*)mat; 1815 } 1816 #endif 1817 } 1818 1819 /* assign the compressed row indices */ 1820 if (a->compressedrow.use) { 1821 cusparsestruct->workVector = new THRUSTARRAY(m); 1822 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1823 matstruct->cprowIndices->assign(ridx,ridx+m); 1824 tmp = m; 1825 } else { 1826 cusparsestruct->workVector = NULL; 1827 matstruct->cprowIndices = NULL; 1828 tmp = 0; 1829 } 1830 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1831 1832 /* assign the pointer */ 1833 cusparsestruct->mat = matstruct; 1834 } catch(char *ex) { 1835 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1836 } 1837 err = WaitForCUDA();CHKERRCUDA(err); 1838 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1839 cusparsestruct->nonzerostate = A->nonzerostate; 1840 } 1841 A->offloadmask = PETSC_OFFLOAD_BOTH; 1842 } 1843 PetscFunctionReturn(0); 1844 } 1845 1846 struct VecCUDAPlusEquals 1847 { 1848 template <typename Tuple> 1849 __host__ __device__ 1850 void operator()(Tuple t) 1851 { 1852 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1853 } 1854 }; 1855 1856 struct VecCUDAEquals 1857 { 1858 template <typename Tuple> 1859 __host__ __device__ 1860 void operator()(Tuple t) 1861 { 1862 thrust::get<1>(t) = thrust::get<0>(t); 1863 } 1864 }; 1865 1866 struct VecCUDAEqualsReverse 1867 { 1868 template <typename Tuple> 1869 __host__ __device__ 1870 void operator()(Tuple t) 1871 { 1872 thrust::get<0>(t) = thrust::get<1>(t); 1873 } 1874 }; 1875 1876 struct MatMatCusparse { 1877 PetscBool cisdense; 1878 PetscScalar *Bt; 1879 Mat X; 1880 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1881 PetscLogDouble flops; 1882 CsrMatrix *Bcsr; 1883 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1884 cusparseSpMatDescr_t matSpBDescr; 1885 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1886 cusparseDnMatDescr_t matBDescr; 1887 cusparseDnMatDescr_t matCDescr; 1888 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1889 size_t mmBufferSize; 1890 void *mmBuffer; 1891 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1892 cusparseSpGEMMDescr_t spgemmDesc; 1893 #endif 1894 }; 1895 1896 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1897 { 1898 PetscErrorCode ierr; 1899 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1900 cudaError_t cerr; 1901 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1902 cusparseStatus_t stat; 1903 #endif 1904 1905 PetscFunctionBegin; 1906 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1907 delete mmdata->Bcsr; 1908 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1909 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1910 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1911 if 
(mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1912 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1913 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1914 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1915 #endif 1916 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1917 ierr = PetscFree(data);CHKERRQ(ierr); 1918 PetscFunctionReturn(0); 1919 } 1920 1921 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1922 1923 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1924 { 1925 Mat_Product *product = C->product; 1926 Mat A,B; 1927 PetscInt m,n,blda,clda; 1928 PetscBool flg,biscuda; 1929 Mat_SeqAIJCUSPARSE *cusp; 1930 cusparseStatus_t stat; 1931 cusparseOperation_t opA; 1932 const PetscScalar *barray; 1933 PetscScalar *carray; 1934 PetscErrorCode ierr; 1935 MatMatCusparse *mmdata; 1936 Mat_SeqAIJCUSPARSEMultStruct *mat; 1937 CsrMatrix *csrmat; 1938 cudaError_t cerr; 1939 1940 PetscFunctionBegin; 1941 MatCheckProduct(C,1); 1942 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1943 mmdata = (MatMatCusparse*)product->data; 1944 A = product->A; 1945 B = product->B; 1946 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1947 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1948 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1949 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1950 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1951 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1952 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1953 switch (product->type) { 1954 case MATPRODUCT_AB: 1955 case MATPRODUCT_PtAP: 1956 mat = cusp->mat; 1957 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1958 m = A->rmap->n; 1959 n = B->cmap->n; 1960 break; 1961 case MATPRODUCT_AtB: 1962 if (!cusp->transgen) { 1963 mat = cusp->mat; 1964 opA = CUSPARSE_OPERATION_TRANSPOSE; 1965 } else { 1966 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1967 mat = cusp->matTranspose; 1968 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1969 } 1970 m = A->cmap->n; 1971 n = B->cmap->n; 1972 break; 1973 case MATPRODUCT_ABt: 1974 case MATPRODUCT_RARt: 1975 mat = cusp->mat; 1976 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1977 m = A->rmap->n; 1978 n = B->rmap->n; 1979 break; 1980 default: 1981 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1982 } 1983 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1984 csrmat = (CsrMatrix*)mat->mat; 1985 /* if the user passed a CPU matrix, copy the data to the GPU */ 1986 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1987 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1988 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1989 1990 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1991 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1992 ierr = 
MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1993 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1994 } else { 1995 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1996 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1997 } 1998 1999 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2000 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2001 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2002 /* (re)allocate mmBuffer if not initialized or the LDAs have changed */ 2003 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2004 size_t mmBufferSize; 2005 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2006 if (!mmdata->matBDescr) { 2007 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2008 mmdata->Blda = blda; 2009 } 2010 2011 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2012 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2013 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2014 mmdata->Clda = clda; 2015 } 2016 2017 if (!mat->matDescr) { 2018 stat = cusparseCreateCsr(&mat->matDescr, 2019 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2020 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2021 csrmat->values->data().get(), 2022 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2023 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2024 } 2025 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2026 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2027 mmdata->matCDescr,cusparse_scalartype, 2028 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2029 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2030 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2031 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2032 mmdata->mmBufferSize = mmBufferSize; 2033 } 2034 mmdata->initialized = PETSC_TRUE; 2035 } else { 2036 /* to be safe, always update pointers of the mats */ 2037 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2038 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2039 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2040 } 2041 2042 /* do cusparseSpMM, which supports transpose on B */ 2043 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2044 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2045 mmdata->matCDescr,cusparse_scalartype, 2046 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2047 #else 2048 PetscInt k; 2049 /* cusparseXcsrmm does not support transpose on B */ 2050 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2051 cublasHandle_t cublasv2handle; 2052 cublasStatus_t cerr; 2053 2054 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2055 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2056 B->cmap->n,B->rmap->n, 2057 &PETSC_CUSPARSE_ONE ,barray,blda, 2058 &PETSC_CUSPARSE_ZERO,barray,blda, 2059
mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2060 blda = B->cmap->n; 2061 k = B->cmap->n; 2062 } else { 2063 k = B->rmap->n; 2064 } 2065 2066 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2067 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2068 csrmat->num_entries,mat->alpha_one,mat->descr, 2069 csrmat->values->data().get(), 2070 csrmat->row_offsets->data().get(), 2071 csrmat->column_indices->data().get(), 2072 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2073 carray,clda);CHKERRCUSPARSE(stat); 2074 #endif 2075 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2076 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2077 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2078 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2079 if (product->type == MATPRODUCT_RARt) { 2080 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2081 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2082 } else if (product->type == MATPRODUCT_PtAP) { 2083 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2084 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2085 } else { 2086 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2087 } 2088 if (mmdata->cisdense) { 2089 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2090 } 2091 if (!biscuda) { 2092 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2093 } 2094 PetscFunctionReturn(0); 2095 } 2096 2097 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2098 { 2099 Mat_Product *product = C->product; 2100 Mat A,B; 2101 PetscInt m,n; 2102 PetscBool cisdense,flg; 2103 PetscErrorCode ierr; 2104 MatMatCusparse *mmdata; 2105 Mat_SeqAIJCUSPARSE *cusp; 2106 2107 PetscFunctionBegin; 2108 MatCheckProduct(C,1); 2109 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2110 A = product->A; 2111 B = product->B; 2112 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2113 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2114 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2115 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2116 switch (product->type) { 2117 case MATPRODUCT_AB: 2118 m = A->rmap->n; 2119 n = B->cmap->n; 2120 break; 2121 case MATPRODUCT_AtB: 2122 m = A->cmap->n; 2123 n = B->cmap->n; 2124 break; 2125 case MATPRODUCT_ABt: 2126 m = A->rmap->n; 2127 n = B->rmap->n; 2128 break; 2129 case MATPRODUCT_PtAP: 2130 m = B->cmap->n; 2131 n = B->cmap->n; 2132 break; 2133 case MATPRODUCT_RARt: 2134 m = B->rmap->n; 2135 n = B->rmap->n; 2136 break; 2137 default: 2138 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2139 } 2140 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2141 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */ 2142 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2143 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2144 2145 /* product data */ 2146 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2147 mmdata->cisdense = cisdense; 2148 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2149 /* cusparseXcsrmm does not support transpose on B, so
we allocate a buffer to store B^T */ 2150 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2151 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2152 } 2153 #endif 2154 /* for these products we need intermediate storage */ 2155 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2156 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2157 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2158 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2159 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2160 } else { 2161 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2162 } 2163 } 2164 C->product->data = mmdata; 2165 C->product->destroy = MatDestroy_MatMatCusparse; 2166 2167 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2168 PetscFunctionReturn(0); 2169 } 2170 2171 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2172 { 2173 Mat_Product *product = C->product; 2174 Mat A,B; 2175 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2176 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2177 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2178 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2179 PetscBool flg; 2180 PetscErrorCode ierr; 2181 cusparseStatus_t stat; 2182 cudaError_t cerr; 2183 MatProductType ptype; 2184 MatMatCusparse *mmdata; 2185 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2186 cusparseSpMatDescr_t BmatSpDescr; 2187 #endif 2188 2189 PetscFunctionBegin; 2190 MatCheckProduct(C,1); 2191 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2192 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2193 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2194 mmdata = (MatMatCusparse*)C->product->data; 2195 A = product->A; 2196 B = product->B; 2197 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */ 2198 mmdata->reusesym = PETSC_FALSE; 2199 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2200 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2201 Cmat = Ccusp->mat; 2202 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2203 Ccsr = (CsrMatrix*)Cmat->mat; 2204 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2205 goto finalize; 2206 } 2207 if (!c->nz) goto finalize; 2208 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2209 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2210 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2211 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2212 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2213 if (B->boundtocpu)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2214 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2215 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2216 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2217 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2218 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2219 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2220 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2221 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2222 2223 ptype = product->type; 2224 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2225 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2226 switch (ptype) { 2227 case MATPRODUCT_AB: 2228 Amat = Acusp->mat; 2229 Bmat = Bcusp->mat; 2230 break; 2231 case MATPRODUCT_AtB: 2232 Amat = Acusp->matTranspose; 2233 Bmat = Bcusp->mat; 2234 break; 2235 case MATPRODUCT_ABt: 2236 Amat = Acusp->mat; 2237 Bmat = Bcusp->matTranspose; 2238 break; 2239 default: 2240 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2241 } 2242 Cmat = Ccusp->mat; 2243 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2244 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2245 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2246 Acsr = (CsrMatrix*)Amat->mat; 2247 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2248 Ccsr = (CsrMatrix*)Cmat->mat; 2249 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2250 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2251 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2252 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2253 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2254 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2255 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2256 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2257 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2258 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2259 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2260 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2261 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2262 #else 2263 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2264 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2265 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2266 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2267 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2268 #endif 2269 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2270 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2271 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2272 C->offloadmask = PETSC_OFFLOAD_GPU; 2273 finalize: 2274 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2275 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2276 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2277 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2278 c->reallocs = 0; 2279 C->info.mallocs += 0; 2280 C->info.nz_unneeded = 0; 2281 C->assembled = C->was_assembled = PETSC_TRUE; 2282 C->num_ass++; 2283 /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! 
*/ 2284 ierr = MatSeqAIJCUSPARSECopyFromGPU(C);CHKERRQ(ierr); 2285 PetscFunctionReturn(0); 2286 } 2287 2288 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2289 { 2290 Mat_Product *product = C->product; 2291 Mat A,B; 2292 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2293 Mat_SeqAIJ *a,*b,*c; 2294 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2295 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2296 PetscInt i,j,m,n,k; 2297 PetscBool flg; 2298 PetscErrorCode ierr; 2299 cusparseStatus_t stat; 2300 cudaError_t cerr; 2301 MatProductType ptype; 2302 MatMatCusparse *mmdata; 2303 PetscLogDouble flops; 2304 PetscBool biscompressed,ciscompressed; 2305 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2306 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2307 size_t bufSize2; 2308 cusparseSpMatDescr_t BmatSpDescr; 2309 #else 2310 int cnz; 2311 #endif 2312 2313 PetscFunctionBegin; 2314 MatCheckProduct(C,1); 2315 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2316 A = product->A; 2317 B = product->B; 2318 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2319 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2320 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2321 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2322 a = (Mat_SeqAIJ*)A->data; 2323 b = (Mat_SeqAIJ*)B->data; 2324 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2325 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2326 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2327 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2328 2329 /* product data */ 2330 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2331 C->product->data = mmdata; 2332 C->product->destroy = MatDestroy_MatMatCusparse; 2333 2334 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2335 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2336 ptype = product->type; 2337 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2338 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2339 biscompressed = PETSC_FALSE; 2340 ciscompressed = PETSC_FALSE; 2341 switch (ptype) { 2342 case MATPRODUCT_AB: 2343 m = A->rmap->n; 2344 n = B->cmap->n; 2345 k = A->cmap->n; 2346 Amat = Acusp->mat; 2347 Bmat = Bcusp->mat; 2348 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2349 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2350 break; 2351 case MATPRODUCT_AtB: 2352 m = A->cmap->n; 2353 n = B->cmap->n; 2354 k = A->rmap->n; 2355 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 2356 Amat = Acusp->matTranspose; 2357 Bmat = Bcusp->mat; 2358 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2359 break; 2360 case MATPRODUCT_ABt: 2361 m = A->rmap->n; 2362 n = B->rmap->n; 2363 k = A->cmap->n; 2364 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 2365 Amat = Acusp->mat; 2366 Bmat = Bcusp->matTranspose; 2367 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2368 break; 2369 default: 2370 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2371 } 2372 2373 /* create cusparse matrix */ 2374 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2375 ierr = 
MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2376 c = (Mat_SeqAIJ*)C->data; 2377 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2378 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2379 Ccsr = new CsrMatrix; 2380 2381 c->compressedrow.use = ciscompressed; 2382 if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */ 2383 c->compressedrow.nrows = a->compressedrow.nrows; 2384 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2385 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2386 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2387 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2388 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2389 } else { 2390 c->compressedrow.nrows = 0; 2391 c->compressedrow.i = NULL; 2392 c->compressedrow.rindex = NULL; 2393 Ccusp->workVector = NULL; 2394 Cmat->cprowIndices = NULL; 2395 } 2396 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2397 Ccusp->mat = Cmat; 2398 Ccusp->mat->mat = Ccsr; 2399 Ccsr->num_rows = Ccusp->nrows; 2400 Ccsr->num_cols = n; 2401 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2402 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2403 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2404 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2405 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2406 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2407 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2408 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2409 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2410 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2411 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns!
*/ 2412 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2413 c->nz = 0; 2414 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2415 Ccsr->values = new THRUSTARRAY(c->nz); 2416 goto finalizesym; 2417 } 2418 2419 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2420 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2421 Acsr = (CsrMatrix*)Amat->mat; 2422 if (!biscompressed) { 2423 Bcsr = (CsrMatrix*)Bmat->mat; 2424 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2425 BmatSpDescr = Bmat->matDescr; 2426 #endif 2427 } else { /* we need to use row offsets for the full matrix */ 2428 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2429 Bcsr = new CsrMatrix; 2430 Bcsr->num_rows = B->rmap->n; 2431 Bcsr->num_cols = cBcsr->num_cols; 2432 Bcsr->num_entries = cBcsr->num_entries; 2433 Bcsr->column_indices = cBcsr->column_indices; 2434 Bcsr->values = cBcsr->values; 2435 if (!Bcusp->rowoffsets_gpu) { 2436 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2437 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2438 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2439 } 2440 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2441 mmdata->Bcsr = Bcsr; 2442 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2443 if (Bcsr->num_rows && Bcsr->num_cols) { 2444 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2445 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2446 Bcsr->values->data().get(), 2447 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2448 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2449 } 2450 BmatSpDescr = mmdata->matSpBDescr; 2451 #endif 2452 } 2453 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2454 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2455 /* precompute flops count */ 2456 if (ptype == MATPRODUCT_AB) { 2457 for (i=0, flops = 0; i<A->rmap->n; i++) { 2458 const PetscInt st = a->i[i]; 2459 const PetscInt en = a->i[i+1]; 2460 for (j=st; j<en; j++) { 2461 const PetscInt brow = a->j[j]; 2462 flops += 2.*(b->i[brow+1] - b->i[brow]); 2463 } 2464 } 2465 } else if (ptype == MATPRODUCT_AtB) { 2466 for (i=0, flops = 0; i<A->rmap->n; i++) { 2467 const PetscInt anzi = a->i[i+1] - a->i[i]; 2468 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2469 flops += (2.*anzi)*bnzi; 2470 } 2471 } else { /* TODO */ 2472 flops = 0.; 2473 } 2474 2475 mmdata->flops = flops; 2476 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2477 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2478 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2479 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2480 NULL, NULL, NULL, 2481 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2482 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2483 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2484 /* ask bufferSize bytes for external memory */ 2485 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2486 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2487 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2488 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 
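/* descriptive note: the workEstimation call above was only a size query (NULL buffer); bufSize2 now holds the required size, so we allocate mmBuffer2 and repeat the call so cusparse can fill it */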
2489 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2490 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2491 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2492 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2493 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2494 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2495 /* ask again for the number of bytes of external memory needed, now for the compute phase */ 2496 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2497 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2498 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2499 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2500 /* Neither the CUSPARSE documentation nor the API is clear here: 2501 we need both buffers to perform the operations properly! 2502 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API; 2503 it only appears in workEstimation, yet it seems to be needed in compute as well, so presumably its address 2504 is stored in the descriptor! What a messy API... */ 2505 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2506 /* compute the intermediate product of A * B */ 2507 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2508 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2509 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2510 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2511 /* get matrix C non-zero entries C_nnz1 */ 2512 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2513 c->nz = (PetscInt) C_nnz1; 2514 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2515 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2516 Ccsr->values = new THRUSTARRAY(c->nz); 2517 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2518 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2519 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2520 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2521 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2522 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2523 #else 2524 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2525 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2526 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2527 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2528 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2529 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2530 c->nz = cnz; 2531 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2532 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2533 Ccsr->values = new THRUSTARRAY(c->nz); 2534 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2535 2536 stat = cusparseSetPointerMode(Ccusp->handle,
CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2537 /* with the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only. 2538 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when 2539 D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */ 2540 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2541 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2542 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2543 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2544 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2545 #endif 2546 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2547 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2548 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2549 finalizesym: 2550 c->singlemalloc = PETSC_FALSE; 2551 c->free_a = PETSC_TRUE; 2552 c->free_ij = PETSC_TRUE; 2553 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2554 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2555 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2556 PetscInt *d_i = c->i; 2557 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2558 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2559 ii = *Ccsr->row_offsets; 2560 jj = *Ccsr->column_indices; 2561 if (ciscompressed) d_i = c->compressedrow.i; 2562 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2563 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2564 } else { 2565 PetscInt *d_i = c->i; 2566 if (ciscompressed) d_i = c->compressedrow.i; 2567 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2568 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2569 } 2570 if (ciscompressed) { /* need to expand host row offsets */ 2571 PetscInt r = 0; 2572 c->i[0] = 0; 2573 for (k = 0; k < c->compressedrow.nrows; k++) { 2574 const PetscInt next = c->compressedrow.rindex[k]; 2575 const PetscInt old = c->compressedrow.i[k]; 2576 for (; r < next; r++) c->i[r+1] = old; 2577 } 2578 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2579 } 2580 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2581 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2582 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2583 c->maxnz = c->nz; 2584 c->nonzerorowcnt = 0; 2585 c->rmax = 0; 2586 for (k = 0; k < m; k++) { 2587 const PetscInt nn = c->i[k+1] - c->i[k]; 2588 c->ilen[k] = c->imax[k] = nn; 2589 c->nonzerorowcnt += (PetscInt)!!nn; 2590 c->rmax = PetscMax(c->rmax,nn); 2591 } 2592 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2593 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2594 Ccsr->num_entries = c->nz; 2595 2596 C->nonzerostate++; 2597 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2598 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2599
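  /* A small worked example of the compressed-row expansion in the ciscompressed branch above (a sketch,
     not executed): with m = 4, compressedrow.nrows = 2, rindex = {1,3} and compressed offsets {0,2,5},
     the loops produce the full host offsets c->i = {0,0,2,2,5}; the empty rows 0 and 2 repeat the
     previous offset, while rows 1 and 3 receive 2 and 3 entries respectively. */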
Ccusp->nonzerostate = C->nonzerostate; 2600 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2601 C->preallocated = PETSC_TRUE; 2602 C->assembled = PETSC_FALSE; 2603 C->was_assembled = PETSC_FALSE; 2604 if (product->api_user) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2605 mmdata->reusesym = PETSC_TRUE; 2606 C->offloadmask = PETSC_OFFLOAD_GPU; 2607 } 2608 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2609 PetscFunctionReturn(0); 2610 } 2611 2612 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2613 2614 /* handles sparse or dense B */ 2615 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2616 { 2617 Mat_Product *product = mat->product; 2618 PetscErrorCode ierr; 2619 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2620 2621 PetscFunctionBegin; 2622 MatCheckProduct(mat,1); 2623 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2624 if (!product->B->boundtocpu) { 2625 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2626 } 2627 if (product->type == MATPRODUCT_ABC) { 2628 Ciscusp = PETSC_FALSE; 2629 if (!product->C->boundtocpu) { 2630 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2631 } 2632 } 2633 if (isdense) { 2634 switch (product->type) { 2635 case MATPRODUCT_AB: 2636 case MATPRODUCT_AtB: 2637 case MATPRODUCT_ABt: 2638 case MATPRODUCT_PtAP: 2639 case MATPRODUCT_RARt: 2640 if (product->A->boundtocpu) { 2641 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2642 } else { 2643 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2644 } 2645 break; 2646 case MATPRODUCT_ABC: 2647 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2648 break; 2649 default: 2650 break; 2651 } 2652 } else if (Biscusp && Ciscusp) { 2653 switch (product->type) { 2654 case MATPRODUCT_AB: 2655 case MATPRODUCT_AtB: 2656 case MATPRODUCT_ABt: 2657 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2658 break; 2659 case MATPRODUCT_PtAP: 2660 case MATPRODUCT_RARt: 2661 case MATPRODUCT_ABC: 2662 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2663 break; 2664 default: 2665 break; 2666 } 2667 } else { /* fallback for AIJ */ 2668 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2669 } 2670 PetscFunctionReturn(0); 2671 } 2672 2673 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2674 { 2675 PetscErrorCode ierr; 2676 2677 PetscFunctionBegin; 2678 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2679 PetscFunctionReturn(0); 2680 } 2681 2682 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2683 { 2684 PetscErrorCode ierr; 2685 2686 PetscFunctionBegin; 2687 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2688 PetscFunctionReturn(0); 2689 } 2690 2691 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2692 { 2693 PetscErrorCode ierr; 2694 2695 PetscFunctionBegin; 2696 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2697 PetscFunctionReturn(0); 2698 } 2699 2700 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2701 { 2702 PetscErrorCode ierr; 2703 2704 PetscFunctionBegin; 2705 ierr = 
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2706 PetscFunctionReturn(0); 2707 } 2708 2709 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2710 { 2711 PetscErrorCode ierr; 2712 2713 PetscFunctionBegin; 2714 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2715 PetscFunctionReturn(0); 2716 } 2717 2718 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2719 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2720 { 2721 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2722 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2723 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2724 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2725 PetscErrorCode ierr; 2726 cudaError_t cerr; 2727 cusparseStatus_t stat; 2728 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2729 PetscBool compressed; 2730 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2731 PetscInt nx,ny; 2732 #endif 2733 2734 PetscFunctionBegin; 2735 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2736 if (!a->nonzerorowcnt) { 2737 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2738 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2739 PetscFunctionReturn(0); 2740 } 2741 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2742 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2743 if (!trans) { 2744 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2745 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2746 } else { 2747 if (herm || !cusparsestruct->transgen) { 2748 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2749 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2750 } else { 2751 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2752 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2753 } 2754 } 2755 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2756 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2757 2758 try { 2759 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2760 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2761 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2762 2763 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2764 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2765 /* z = A x + beta y. 2766 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2767 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2768 */ 2769 xptr = xarray; 2770 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2771 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2772 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2773 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2774 allocated to accommodate different uses. So we get the length info directly from mat. 
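       For example, in the non-transpose compressed-row case ny equals mat->num_rows, the number of stored
       (nonempty) rows, which can be smaller than both A->rmap->n and the work vector's allocated length.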
2775 */ 2776 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2777 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2778 nx = mat->num_cols; 2779 ny = mat->num_rows; 2780 } 2781 #endif 2782 } else { 2783 /* z = A^T x + beta y 2784 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2785 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2786 */ 2787 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2788 dptr = zarray; 2789 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2790 if (compressed) { /* Scatter x to work vector */ 2791 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2792 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2793 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2794 VecCUDAEqualsReverse()); 2795 } 2796 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2797 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2798 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2799 nx = mat->num_rows; 2800 ny = mat->num_cols; 2801 } 2802 #endif 2803 } 2804 2805 /* csr_spmv does y = alpha op(A) x + beta y */ 2806 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2807 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2808 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2809 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2810 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2811 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2812 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2813 matstruct->matDescr, 2814 matstruct->cuSpMV[opA].vecXDescr, beta, 2815 matstruct->cuSpMV[opA].vecYDescr, 2816 cusparse_scalartype, 2817 cusparsestruct->spmvAlg, 2818 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2819 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2820 2821 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2822 } else { 2823 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2824 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2825 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2826 } 2827 2828 stat = cusparseSpMV(cusparsestruct->handle, opA, 2829 matstruct->alpha_one, 2830 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2831 matstruct->cuSpMV[opA].vecXDescr, 2832 beta, 2833 matstruct->cuSpMV[opA].vecYDescr, 2834 cusparse_scalartype, 2835 cusparsestruct->spmvAlg, 2836 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2837 #else 2838 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2839 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2840 mat->num_rows, mat->num_cols, 2841 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2842 mat->values->data().get(), mat->row_offsets->data().get(), 2843 mat->column_indices->data().get(), xptr, beta, 2844 dptr);CHKERRCUSPARSE(stat); 2845 
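      /* cusparse_csr_spmv above is a PETSc precision-dispatch macro; it presumably expands to the legacy
         cusparse{S,D,C,Z}csrmv routines, which are only available before CUDA 11.0 (hence this #else
         branch) and compute y = alpha*op(A)*x + beta*y. */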
#endif 2846 } else { 2847 if (cusparsestruct->nrows) { 2848 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2849 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2850 #else 2851 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2852 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2853 matstruct->alpha_one, matstruct->descr, hybMat, 2854 xptr, beta, 2855 dptr);CHKERRCUSPARSE(stat); 2856 #endif 2857 } 2858 } 2859 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2860 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2861 2862 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2863 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2864 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2865 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2866 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2867 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2868 } 2869 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2870 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2871 } 2872 2873 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2874 if (compressed) { 2875 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2876 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2877 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2878 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2879 VecCUDAPlusEquals()); 2880 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2881 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2882 } 2883 } else { 2884 if (yy && yy != zz) { 2885 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2886 } 2887 } 2888 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2889 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2890 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2891 } catch(char *ex) { 2892 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2893 } 2894 if (yy) { 2895 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2896 } else { 2897 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2898 } 2899 PetscFunctionReturn(0); 2900 } 2901 2902 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2903 { 2904 PetscErrorCode ierr; 2905 2906 PetscFunctionBegin; 2907 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2908 PetscFunctionReturn(0); 2909 } 2910 2911 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2912 { 2913 PetscErrorCode ierr; 2914 PetscSplitCSRDataStructure *d_mat = NULL; 2915 PetscFunctionBegin; 2916 if (A->factortype == MAT_FACTOR_NONE) { 2917 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2918 } 2919 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 
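  /* If a split CSR device matrix exists, the values were assembled directly on the GPU, so the GPU copy
     is marked as the authoritative one below */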
2920 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2921 if (d_mat) { 2922 A->offloadmask = PETSC_OFFLOAD_GPU; 2923 } 2924 2925 PetscFunctionReturn(0); 2926 } 2927 2928 /* --------------------------------------------------------------------------------*/ 2929 /*@ 2930 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2931 (the default parallel PETSc format). This matrix will ultimately be pushed down 2932 to NVidia GPUs and will use the CUSPARSE library for calculations. For good matrix 2933 assembly performance the user should preallocate the matrix storage by setting 2934 the parameter nz (or the array nnz). By setting these parameters accurately, 2935 performance during matrix assembly can be increased by more than a factor of 50. 2936 2937 Collective 2938 2939 Input Parameters: 2940 + comm - MPI communicator, set to PETSC_COMM_SELF 2941 . m - number of rows 2942 . n - number of columns 2943 . nz - number of nonzeros per row (same for all rows) 2944 - nnz - array containing the number of nonzeros in the various rows 2945 (possibly different for each row) or NULL 2946 2947 Output Parameter: 2948 . A - the matrix 2949 2950 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2951 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2952 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2953 2954 Notes: 2955 If nnz is given then nz is ignored 2956 2957 The AIJ format (also called the Yale sparse matrix format or 2958 compressed row storage) is fully compatible with standard Fortran 77 2959 storage. That is, the stored row and column indices can begin at 2960 either one (as in Fortran) or zero. See the users' manual for details. 2961 2962 Specify the preallocated storage with either nz or nnz (not both). 2963 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 2964 allocation. For large problems you MUST preallocate memory or you 2965 will get TERRIBLE performance, see the users' manual chapter on matrices. 2966 2967 By default, this format uses inodes (identical nodes) when possible, to 2968 improve numerical efficiency of matrix-vector products and solves. We 2969 search for consecutive rows with the same nonzero structure, thereby 2970 reusing matrix information to achieve increased efficiency.
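   Example usage, a minimal sketch with declarations and error checking omitted (the value 5 is only an
   illustrative estimate of the nonzeros per row):
.vb
     Mat A;
     MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,m,n,5,NULL,&A);
     MatSetValues(A,1,&row,1,&col,&val,INSERT_VALUES);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
     MatMult(A,x,y);
     MatDestroy(&A);
.ve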
2971 2972 Level: intermediate 2973 2974 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2975 @*/ 2976 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2977 { 2978 PetscErrorCode ierr; 2979 2980 PetscFunctionBegin; 2981 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2982 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2983 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2984 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2985 PetscFunctionReturn(0); 2986 } 2987 2988 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 2989 { 2990 PetscErrorCode ierr; 2991 PetscSplitCSRDataStructure *d_mat = NULL; 2992 2993 PetscFunctionBegin; 2994 if (A->factortype == MAT_FACTOR_NONE) { 2995 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2996 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 2997 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 2998 } else { 2999 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3000 } 3001 if (d_mat) { 3002 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3003 cudaError_t err; 3004 PetscSplitCSRDataStructure h_mat; 3005 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3006 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3007 if (a->compressedrow.use) { 3008 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3009 } 3010 err = cudaFree(d_mat);CHKERRCUDA(err); 3011 } 3012 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3013 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3014 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3015 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3016 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3017 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3018 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3019 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3020 PetscFunctionReturn(0); 3021 } 3022 3023 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3024 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3025 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3026 { 3027 PetscErrorCode ierr; 3028 3029 PetscFunctionBegin; 3030 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3031 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3032 PetscFunctionReturn(0); 3033 } 3034 3035 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
3036 { 3037 PetscErrorCode ierr; 3038 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3039 PetscBool flgx,flgy; 3040 3041 PetscFunctionBegin; 3042 if (a == (PetscScalar)0.0) PetscFunctionReturn(0); 3043 PetscValidHeaderSpecific(Y,MAT_CLASSID,1); 3044 PetscValidHeaderSpecific(X,MAT_CLASSID,3); 3045 ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr); 3046 ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr); 3047 if (!flgx || !flgy) { 3048 ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 3049 PetscFunctionReturn(0); 3050 } 3051 if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE"); 3052 if (str == DIFFERENT_NONZERO_PATTERN) { 3053 if (x->nz == y->nz) { 3054 PetscBool e; 3055 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 3056 if (e) { 3057 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 3058 if (e) { 3059 str = SAME_NONZERO_PATTERN; 3060 } 3061 } 3062 } 3063 } 3064 if (str != SAME_NONZERO_PATTERN) { 3065 ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 3066 PetscFunctionReturn(0); 3067 } else { 3068 Mat_SeqAIJCUSPARSE *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3069 Mat_SeqAIJCUSPARSE *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr; 3070 if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3071 if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3072 if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) { 3073 if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) { 3074 ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr); 3075 } 3076 if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) { 3077 ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr); 3078 } 3079 ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3080 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3081 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3082 } else { 3083 cublasHandle_t cublasv2handle; 3084 cublasStatus_t cberr; 3085 cudaError_t err; 3086 PetscScalar alpha = a; 3087 PetscBLASInt one = 1, bnz = 1; 3088 CsrMatrix *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat; 3089 CsrMatrix *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat; 3090 PetscScalar *aa_y, *aa_x; 3091 aa_y = matrix_y->values->data().get(); 3092 aa_x = matrix_x->values->data().get(); 3093 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3094 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3095 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3096 cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr); 3097 err = WaitForCUDA();CHKERRCUDA(err); 3098 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3099 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3100 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3101 ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr); 3102 if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU; 3103 else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state"); 3104 ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr); 3105 } 3106 } 3107 PetscFunctionReturn(0); 3108 } 3109 3110 static PetscErrorCode 
MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3111 { 3112 PetscErrorCode ierr; 3113 PetscBool both = PETSC_FALSE; 3114 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3115 3116 PetscFunctionBegin; 3117 if (A->factortype == MAT_FACTOR_NONE) { 3118 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3119 if (spptr->mat) { 3120 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3121 if (matrix->values) { 3122 both = PETSC_TRUE; 3123 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3124 } 3125 } 3126 if (spptr->matTranspose) { 3127 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3128 if (matrix->values) { 3129 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3130 } 3131 } 3132 } 3133 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3134 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3135 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3136 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3137 else A->offloadmask = PETSC_OFFLOAD_CPU; 3138 3139 PetscFunctionReturn(0); 3140 } 3141 3142 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3143 { 3144 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3145 PetscErrorCode ierr; 3146 3147 PetscFunctionBegin; 3148 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3149 if (flg) { 3150 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3151 3152 A->ops->axpy = MatAXPY_SeqAIJ; 3153 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3154 A->ops->mult = MatMult_SeqAIJ; 3155 A->ops->multadd = MatMultAdd_SeqAIJ; 3156 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3157 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3158 A->ops->multhermitiantranspose = NULL; 3159 A->ops->multhermitiantransposeadd = NULL; 3160 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3161 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3162 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3163 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3164 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3165 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3166 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3167 } else { 3168 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3169 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3170 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3171 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3172 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3173 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3174 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3175 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3176 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3177 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3178 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3179 ierr = 
PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3180 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3181 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3182 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3183 } 3184 A->boundtocpu = flg; 3185 a->inode.use = flg; 3186 PetscFunctionReturn(0); 3187 } 3188 3189 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3190 { 3191 PetscErrorCode ierr; 3192 cusparseStatus_t stat; 3193 Mat B; 3194 3195 PetscFunctionBegin; 3196 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3197 if (reuse == MAT_INITIAL_MATRIX) { 3198 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3199 } else if (reuse == MAT_REUSE_MATRIX) { 3200 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3201 } 3202 B = *newmat; 3203 3204 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3205 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3206 3207 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3208 if (B->factortype == MAT_FACTOR_NONE) { 3209 Mat_SeqAIJCUSPARSE *spptr; 3210 3211 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3212 spptr->format = MAT_CUSPARSE_CSR; 3213 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3214 B->spptr = spptr; 3215 spptr->deviceMat = NULL; 3216 } else { 3217 Mat_SeqAIJCUSPARSETriFactors *spptr; 3218 3219 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3220 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3221 B->spptr = spptr; 3222 } 3223 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3224 } 3225 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3226 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3227 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3228 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3229 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3230 3231 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3232 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3233 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3234 PetscFunctionReturn(0); 3235 } 3236 3237 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3238 { 3239 PetscErrorCode ierr; 3240 3241 PetscFunctionBegin; 3242 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3243 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3244 ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 3245 ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 3246 ierr = PetscOptionsEnd();CHKERRQ(ierr); 3247 PetscFunctionReturn(0); 3248 } 3249 3250 /*MC 3251 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3252 3253 A matrix type whose data resides on Nvidia GPUs. These matrices can be in either 3254 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3255 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
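   A minimal sketch of selecting this type programmatically (error checking omitted):
.vb
     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,m,n,m,n);
     MatSetType(A,MATSEQAIJCUSPARSE);
     MatSeqAIJSetPreallocation(A,nz,NULL);
.ve
   Alternatively, call MatSetFromOptions() and pass -mat_type aijcusparse on the command line.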
3256 3257 Options Database Keys: 3258 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3259 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3260 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3261 3262 Level: beginner 3263 3264 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3265 M*/ 3266 3267 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 3268 3269 3270 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3271 { 3272 PetscErrorCode ierr; 3273 3274 PetscFunctionBegin; 3275 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3276 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3277 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3278 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3279 PetscFunctionReturn(0); 3280 } 3281 3282 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3283 { 3284 PetscErrorCode ierr; 3285 cusparseStatus_t stat; 3286 3287 PetscFunctionBegin; 3288 if (*cusparsestruct) { 3289 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3290 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3291 delete (*cusparsestruct)->workVector; 3292 delete (*cusparsestruct)->rowoffsets_gpu; 3293 delete (*cusparsestruct)->cooPerm; 3294 delete (*cusparsestruct)->cooPerm_a; 3295 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3296 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3297 cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 3298 #endif 3299 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3300 } 3301 PetscFunctionReturn(0); 3302 } 3303 3304 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3305 { 3306 PetscFunctionBegin; 3307 if (*mat) { 3308 delete (*mat)->values; 3309 delete (*mat)->column_indices; 3310 delete (*mat)->row_offsets; 3311 delete *mat; 3312 *mat = 0; 3313 } 3314 PetscFunctionReturn(0); 3315 } 3316 3317 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3318 { 3319 cusparseStatus_t stat; 3320 PetscErrorCode ierr; 3321 3322 PetscFunctionBegin; 3323 if (*trifactor) { 3324 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3325 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3326 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3327 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3328 if ((*trifactor)->AA_h) {cudaError_t cerr = 
cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3329 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3330 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3331 #endif 3332 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3333 } 3334 PetscFunctionReturn(0); 3335 } 3336 3337 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3338 { 3339 CsrMatrix *mat; 3340 cusparseStatus_t stat; 3341 cudaError_t err; 3342 3343 PetscFunctionBegin; 3344 if (*matstruct) { 3345 if ((*matstruct)->mat) { 3346 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3347 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3348 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3349 #else 3350 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3351 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3352 #endif 3353 } else { 3354 mat = (CsrMatrix*)(*matstruct)->mat; 3355 CsrMatrix_Destroy(&mat); 3356 } 3357 } 3358 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3359 delete (*matstruct)->cprowIndices; 3360 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3361 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3362 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3363 3364 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3365 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3366 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3367 for (int i=0; i<3; i++) { 3368 if (mdata->cuSpMV[i].initialized) { 3369 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3370 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3371 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3372 } 3373 } 3374 #endif 3375 delete *matstruct; 3376 *matstruct = NULL; 3377 } 3378 PetscFunctionReturn(0); 3379 } 3380 3381 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3382 { 3383 PetscErrorCode ierr; 3384 3385 PetscFunctionBegin; 3386 if (*trifactors) { 3387 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3388 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3389 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3390 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3391 delete (*trifactors)->rpermIndices; 3392 delete (*trifactors)->cpermIndices; 3393 delete (*trifactors)->workVector; 3394 (*trifactors)->rpermIndices = NULL; 3395 (*trifactors)->cpermIndices = NULL; 3396 (*trifactors)->workVector = NULL; 3397 } 3398 PetscFunctionReturn(0); 3399 } 3400 3401 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3402 { 3403 PetscErrorCode ierr; 3404 cusparseHandle_t handle; 3405 cusparseStatus_t stat; 3406 3407 PetscFunctionBegin; 3408 if (*trifactors) { 3409 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3410 if ((handle = (*trifactors)->handle)) { 3411 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3412 } 3413 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3414 } 3415 PetscFunctionReturn(0); 3416 }
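/* Helper functors for the COO assembly routines below: IJCompare orders (i,j) pairs lexicographically,
   IJEqual flags duplicate pairs, and IJDiff/IJSum are combined (via adjacent_difference, transform and
   inclusive_scan in MatSetPreallocationCOO_SeqAIJCUSPARSE) to map each sorted COO entry to the index of
   its unique (i,j) slot. A small worked example: for sorted input (0,2),(0,2),(1,0) the row changes are
   {0,0,1} and the column changes {0,0,1}; their IJSum is {0,0,1}, whose inclusive scan {0,0,1} sends the
   first two entries to unique slot 0 and the third to slot 1, which is exactly the key array that
   reduce_by_key uses in MatSetValuesCOO_SeqAIJCUSPARSE to sum values with repeated indices. */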
3417 3418 struct IJCompare 3419 { 3420 __host__ __device__ 3421 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3422 { 3423 if (t1.get<0>() < t2.get<0>()) return true; 3424 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3425 return false; 3426 } 3427 }; 3428 3429 struct IJEqual 3430 { 3431 __host__ __device__ 3432 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3433 { 3434 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3435 return true; 3436 } 3437 }; 3438 3439 struct IJDiff 3440 { 3441 __host__ __device__ 3442 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3443 { 3444 return t1 == t2 ? 0 : 1; 3445 } 3446 }; 3447 3448 struct IJSum 3449 { 3450 __host__ __device__ 3451 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3452 { 3453 return t1||t2; 3454 } 3455 }; 3456 3457 #include <thrust/iterator/discard_iterator.h> 3458 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3459 { 3460 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3461 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3462 THRUSTARRAY *cooPerm_v = NULL,*cooPerm_w; 3463 thrust::device_ptr<const PetscScalar> d_v; 3464 CsrMatrix *matrix; 3465 PetscErrorCode ierr; 3466 cudaError_t cerr; 3467 PetscInt n; 3468 3469 PetscFunctionBegin; 3470 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3471 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3472 if (!cusp->cooPerm) { 3473 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3474 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3475 PetscFunctionReturn(0); 3476 } 3477 matrix = (CsrMatrix*)cusp->mat->mat; 3478 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3479 ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 3480 if (!v) { 3481 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3482 goto finalize; 3483 } 3484 n = cusp->cooPerm->size(); 3485 if (isCudaMem(v)) { 3486 d_v = thrust::device_pointer_cast(v); 3487 } else { 3488 cooPerm_v = new THRUSTARRAY(n); 3489 cooPerm_v->assign(v,v+n); 3490 d_v = cooPerm_v->data(); 3491 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3492 } 3493 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3494 if (cusp->cooPerm_a) { 3495 cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3496 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3497 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3498 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3499 delete cooPerm_w; 3500 } else { 3501 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3502 matrix->values->begin())); 3503 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3504 matrix->values->end())); 3505 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3506 } 3507 } else { 3508 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> 
reduce */ 3509 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3510 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3511 } else { 3512 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3513 matrix->values->begin())); 3514 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3515 matrix->values->end())); 3516 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3517 } 3518 } 3519 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3520 ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 3521 finalize: 3522 delete cooPerm_v; 3523 A->offloadmask = PETSC_OFFLOAD_GPU; 3524 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3525 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3526 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3527 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3528 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3529 a->reallocs = 0; 3530 A->info.mallocs += 0; 3531 A->info.nz_unneeded = 0; 3532 A->assembled = A->was_assembled = PETSC_TRUE; 3533 A->num_ass++; 3534 /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */ 3535 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3536 PetscFunctionReturn(0); 3537 } 3538 3539 #include <thrust/binary_search.h> 3540 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3541 { 3542 PetscErrorCode ierr; 3543 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3544 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3545 PetscInt cooPerm_n, nzr = 0; 3546 cudaError_t cerr; 3547 3548 PetscFunctionBegin; 3549 ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 3550 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3551 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3552 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3553 if (n != cooPerm_n) { 3554 delete cusp->cooPerm; 3555 delete cusp->cooPerm_a; 3556 cusp->cooPerm = NULL; 3557 cusp->cooPerm_a = NULL; 3558 } 3559 if (n) { 3560 THRUSTINTARRAY d_i(n); 3561 THRUSTINTARRAY d_j(n); 3562 THRUSTINTARRAY ii(A->rmap->n); 3563 3564 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3565 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3566 3567 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3568 d_i.assign(coo_i,coo_i+n); 3569 d_j.assign(coo_j,coo_j+n); 3570 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3571 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3572 3573 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3574 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3575 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3576 *cusp->cooPerm_a = d_i; 3577 THRUSTINTARRAY w = d_j; 3578 3579 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3580 if (nekey == ekey) { /* all entries are unique */ 3581 delete cusp->cooPerm_a; 3582 cusp->cooPerm_a = NULL; 3583 } else { /* I couldn't come up with a more elegant algorithm */ 3584 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3585 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3586 (*cusp->cooPerm_a)[0] = 0; 3587 w[0] = 0; 3588 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3589 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3590 } 3591 thrust::counting_iterator<PetscInt> search_begin(0); 3592 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3593 search_begin, search_begin + A->rmap->n, 3594 ii.begin()); 3595 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3596 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3597 3598 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3599 a->singlemalloc = PETSC_FALSE; 3600 a->free_a = PETSC_TRUE; 3601 a->free_ij = PETSC_TRUE; 3602 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3603 a->i[0] = 0; 3604 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3605 a->nz = a->maxnz = a->i[A->rmap->n]; 3606 a->rmax = 0; 3607 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3608 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3609 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3610 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3611 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3612 for (PetscInt i = 0; i < A->rmap->n; i++) { 3613 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3614 nzr += (PetscInt)!!(nnzr); 3615 a->ilen[i] = a->imax[i] = nnzr; 3616 a->rmax = PetscMax(a->rmax,nnzr); 3617 } 3618 a->nonzerorowcnt = nzr; 3619 A->preallocated = PETSC_TRUE; 3620 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3621 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3622 } else { 3623 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3624 } 3625 ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 3626 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3627 3628 /* We want to allocate the CUSPARSE struct for matvec now. 
3629 The code is so convoluted now that I prefer to copy zeros */ 3630 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3631 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3632 A->offloadmask = PETSC_OFFLOAD_CPU; 3633 A->nonzerostate++; 3634 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3635 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3636 3637 A->assembled = PETSC_FALSE; 3638 A->was_assembled = PETSC_FALSE; 3639 PetscFunctionReturn(0); 3640 } 3641 3642 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3643 { 3644 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3645 CsrMatrix *csr; 3646 PetscErrorCode ierr; 3647 3648 PetscFunctionBegin; 3649 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3650 PetscValidPointer(a,2); 3651 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3652 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3653 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3654 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3655 csr = (CsrMatrix*)cusp->mat->mat; 3656 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3657 *a = csr->values->data().get(); 3658 PetscFunctionReturn(0); 3659 } 3660 3661 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3662 { 3663 PetscFunctionBegin; 3664 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3665 PetscValidPointer(a,2); 3666 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3667 *a = NULL; 3668 PetscFunctionReturn(0); 3669 } 3670 3671 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3672 { 3673 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3674 CsrMatrix *csr; 3675 3676 PetscFunctionBegin; 3677 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3678 PetscValidPointer(a,2); 3679 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3680 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3681 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3682 csr = (CsrMatrix*)cusp->mat->mat; 3683 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3684 *a = csr->values->data().get(); 3685 PetscFunctionReturn(0); 3686 } 3687 3688 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3689 { 3690 PetscErrorCode ierr; 3691 3692 PetscFunctionBegin; 3693 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3694 PetscValidPointer(a,2); 3695 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3696 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3697 A->offloadmask = PETSC_OFFLOAD_GPU; 3698 *a = NULL; 3699 PetscFunctionReturn(0); 3700 } 3701 3702 struct IJCompare4 3703 { 3704 __host__ __device__ 3705 inline bool operator() (const thrust::tuple<int, int, PetscScalar, bool> &t1, const thrust::tuple<int, int, PetscScalar, bool> &t2) 3706 { 3707 if (t1.get<0>() < t2.get<0>()) return true; 3708 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3709 return false; 3710 } 3711 }; 3712 3713 struct Shift 3714 { 3715 int _shift; 3716 3717 Shift(int shift) : _shift(shift) {} 3718 __host__ __device__ 3719 inline int operator() (const int &c) 3720 { 3721 return c + _shift; 3722 } 3723 }; 3724 3725 /* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in MATLAB notation */ 3726 PetscErrorCode
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, bool> &t1, const thrust::tuple<int, int, PetscScalar, bool> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices side by side, [A';B']' in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m    = A->rmap->n;
    n    = A->cmap->n + B->cmap->n;
    ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix*)Acusp->mat->mat;
    Bcsr  = (CsrMatrix*)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
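    /* Outline of the initial merge, as implemented below: expand A's and B's row
       offsets to COO row indices with cusparseXcsr2coo, shift B's column indices
       by A->cmap->n, merge the two (row,col,val) streams ordered by (row,col)
       with IJCompare4, then convert the merged row indices back to CSR offsets
       with cusparseXcoo2csr. */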
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);

    if (c->nz) {
      THRUSTINTARRAY32 Acoo(Annz);
      THRUSTINTARRAY32 Bcoo(Bnnz);
      THRUSTINTARRAY32 *roff;
      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Acusp->rowoffsets_gpu;
      } else roff = Acsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              roff->data().get(),
                              Annz,
                              m,
                              Acoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Bcusp->rowoffsets_gpu;
      } else roff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              roff->data().get(),
                              Bnnz,
                              m,
                              Bcoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      THRUSTINTARRAY32 Ccoo(c->nz);
      auto Aperm = thrust::make_constant_iterator(true);
      auto Bperm = thrust::make_constant_iterator(false);
      auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
      thrust::device_vector<bool> wPerm(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm.begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      thrust::merge(Azb,Aze,Bzb,Bze,Czb,IJCompare4());
      thrust::partition_copy(thrust::make_counting_iterator(zero),thrust::make_counting_iterator(c->nz),wPerm.begin(),p1,p2,thrust::identity<bool>());
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo.data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
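      /* cooPerm now records where each of A's entries (the first Annz positions,
         written through p1) and B's entries (the rest, through p2) landed in C's
         merged arrays; the MAT_REUSE_MATRIX branch below replays this permutation
         to refresh C's values without redoing the merge. */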
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have transposes, generate C's transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen     = PETSC_TRUE;
        CmatT->cprowIndices = NULL;
        CmatT->mat          = CcsrT;
        CcsrT->num_rows     = n;
        CcsrT->num_cols     = m;
        CcsrT->num_entries  = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }
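    /* Mirror the merged structure on the host so the SeqAIJ side of C stays
       usable; the values array is allocated below and filled by the
       MatSeqAIJCUSPARSECopyFromGPU at the end of this routine. */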
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
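      /* Scatter fresh values from A and B into C through cooPerm: zipping each
         source value range with a permutation_iterator over C's values lets a
         single thrust::for_each(...,VecCUDAEquals()) perform the copy on the GPU. */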
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
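/* Example usage (a sketch; A and B must be assembled MATSEQAIJCUSPARSE matrices
   with the same number of rows):

     Mat C;
     ierr = MatSeqAIJCUSPARSEMergeMats(A,B,MAT_INITIAL_MATRIX,&C);CHKERRQ(ierr); // C = [A B]
     ... update values (but not the nonzero pattern) of A or B ...
     ierr = MatSeqAIJCUSPARSEMergeMats(A,B,MAT_REUSE_MATRIX,&C);CHKERRQ(ierr);   // refresh C's values
*/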