/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
/* cusparsematimpl.h provides its own VecType; undefine the PETSc macro first to avoid a clash */
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String table in the layout PetscOptionsEnum() expects: the enum value names in
   0-based order, then the enum type name, the common prefix, and a null terminator */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
 */
/* Note the entries are listed in cuSPARSE enum *value* order (see comment above), so the
   PetscOptionsEnum() index maps directly onto the cuSPARSE enum value; the consistency
   checks in MatSetFromOptions_SeqAIJCUSPARSE() verify this mapping has not changed. */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the SeqAIJCUSPARSE implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destruction/reset helpers for the GPU-side data hanging off Mat->spptr */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Record a caller-supplied CUDA stream in A->spptr and make the cuSPARSE handle use it.
   Errors with PETSC_ERR_COR if the GPU data structure (spptr) has not been set up. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replace the cuSPARSE handle stored in A->spptr with a caller-supplied one,
   destroying any handle the matrix previously owned, and set the handle's
   pointer mode to CUSPARSE_POINTER_MODE_DEVICE. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the handle we currently hold before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the cuSPARSE handle stored in A->spptr.
   NOTE(review): the handle is only zeroed here, never cusparseDestroy()ed —
   presumably ownership rests with whoever installed it via MatCUSPARSESetHandle();
   confirm before changing. No-op for non-SEQAIJCUSPARSE matrices or missing spptr. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the solver type */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory for MATSOLVERCUSPARSE factor matrices: creates an empty SEQAIJCUSPARSE matrix B
   of the same (square, local) size as A and installs the symbolic-factorization callbacks
   for the requested factor type (LU/ILU/ILUDT or Cholesky/ICC); any other type errors. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  /* the factorization routines here use (and benefit from) an ordering */
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* storage is allocated later by the symbolic factorization, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(); records the storage
   format in A->spptr. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL currently both set the
   single format field; any other operation is unsupported. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if this Mat type composed one */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr =
PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* runtime options for a SEQAIJCUSPARSE matrix: explicit-transpose
   generation, storage formats for SpMV/TriSolve and, for CUDA >= 11, the cuSPARSE
   SpMV/SpMM/csr2csc algorithm choices. Only applied to unfactored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for both SpMV and triangular solve */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: reset any stale GPU triangular-factor data, run the CPU AIJ symbolic
   phase, then point the numeric phase at the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above — CPU symbolic phase, GPU numeric phase. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic phase, GPU (Cholesky) numeric phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic phase, GPU numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Extract the unit-diagonal lower triangular factor L from the host ILU factor stored in
   A->data (PETSc factored-AIJ layout) and upload it to the GPU as a CSR matrix, running
   the cuSPARSE triangular-solve analysis on first build; on later calls only the numerical
   values are refreshed. No-op unless the host copy is newer than the device copy. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
       */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1
         (row 0 contributes only its implicit unit diagonal) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        /* first build: assemble L in pinned host memory, then copy to the device */
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API needs an explicit work buffer for the triangular-solve analysis */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for later value-only updates; indices are no longer needed on the host */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Extract the upper triangular factor U (non-unit diagonal, stored as its reciprocal)
   from the host ILU factor in A->data and upload it to the GPU as CSR, running the
   cuSPARSE solve analysis on first build; later calls refresh values only. Mirrors
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix() above. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next,
         figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] runs backwards in the factored-AIJ layout, hence adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        /* first build: assemble U in pinned host memory, then copy to the device */
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix; rows are walked backwards following adiag[] */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          /* the host factor stores the reciprocal of the diagonal; invert it back for U */
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API needs an explicit work buffer for the triangular-solve analysis */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for later value-only updates; indices are no longer needed on the host */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build/upload both ILU triangular factors on the GPU, allocate the solve work vector,
   and cache the row/column permutation indices (only when the orderings are not the
   identity) for use by the GPU triangular solves. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if
(!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 651 cusparseTriFactors->nnz=a->nz; 652 653 A->offloadmask = PETSC_OFFLOAD_BOTH; 654 /* lower triangular indices */ 655 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 656 if (!row_identity && !cusparseTriFactors->rpermIndices) { 657 const PetscInt *r; 658 659 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 660 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 661 cusparseTriFactors->rpermIndices->assign(r, r+n); 662 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 663 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 664 } 665 666 /* upper triangular indices */ 667 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 668 if (!col_identity && !cusparseTriFactors->cpermIndices) { 669 const PetscInt *c; 670 671 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 672 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 673 cusparseTriFactors->cpermIndices->assign(c, c+n); 674 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 675 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 676 } 677 PetscFunctionReturn(0); 678 } 679 680 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 681 { 682 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 683 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 684 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 685 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 686 cusparseStatus_t stat; 687 PetscErrorCode ierr; 688 cudaError_t cerr; 689 PetscInt *AiUp, *AjUp; 690 PetscScalar *AAUp; 691 PetscScalar *AALo; 692 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 693 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 694 const PetscInt *ai = b->i,*aj = b->j,*vj; 695 const MatScalar *aa = b->a,*v; 696 697 PetscFunctionBegin; 698 if (!n) 
PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (stored inverted for the unit/non-unit solves) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals are negated for U; AALo additionally rescaled by the diagonal */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the "lower" solve is U^T x = b: keep UPPER fill mode and solve with OPERATION_TRANSPOSE below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same sparsity pattern as U, different values (AALo) */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Structure already on the GPU: recompute only the values and re-upload */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the ICC factors on the GPU and cache the (inverse) permutation for
   non-natural orderings; nnz accounts for both triangles sharing the diagonal. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both triangles minus the shared diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 923 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 924 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 925 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 926 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 927 ierr = ISDestroy(&iip);CHKERRQ(ierr); 928 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 929 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 930 } 931 PetscFunctionReturn(0); 932 } 933 934 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 935 { 936 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 937 IS isrow = b->row,iscol = b->col; 938 PetscBool row_identity,col_identity; 939 PetscErrorCode ierr; 940 941 PetscFunctionBegin; 942 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 943 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 944 B->offloadmask = PETSC_OFFLOAD_CPU; 945 /* determine which version of MatSolve needs to be used. */ 946 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 947 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 948 if (row_identity && col_identity) { 949 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 950 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 951 B->ops->matsolve = NULL; 952 B->ops->matsolvetranspose = NULL; 953 } else { 954 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 955 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 956 B->ops->matsolve = NULL; 957 B->ops->matsolvetranspose = NULL; 958 } 959 960 /* get the triangular factors */ 961 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 962 PetscFunctionReturn(0); 963 } 964 965 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 966 { 967 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 968 IS ip = b->row; 969 PetscBool perm_identity; 970 PetscErrorCode ierr; 971 972 PetscFunctionBegin; 973 ierr = 
MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build explicit CSC copies (i.e. transposes) of both triangular factors and
   run the cuSPARSE solve analysis on them, so MatSolveTranspose can run two
   NON_TRANSPOSE solves instead of transposed ones. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips because the transpose of a lower factor is upper */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e.
the CSC */ 1042 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1043 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1044 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1045 loTriFactor->csrMat->values->data().get(), 1046 loTriFactor->csrMat->row_offsets->data().get(), 1047 loTriFactor->csrMat->column_indices->data().get(), 1048 loTriFactorT->csrMat->values->data().get(), 1049 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1050 CUSPARSE_ACTION_NUMERIC,indexBase, 1051 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1052 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1053 #endif 1054 1055 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1056 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1057 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1058 loTriFactor->csrMat->values->data().get(), 1059 loTriFactor->csrMat->row_offsets->data().get(), 1060 loTriFactor->csrMat->column_indices->data().get(), 1061 loTriFactorT->csrMat->values->data().get(), 1062 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1063 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1064 CUSPARSE_ACTION_NUMERIC, indexBase, 1065 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1066 #else 1067 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1068 CUSPARSE_ACTION_NUMERIC, indexBase 1069 #endif 1070 );CHKERRCUSPARSE(stat); 1071 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1072 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1073 1074 /* Create the solve analysis information */ 1075 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1076 stat = 
cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor;
     fill mode flips under transposition, as for the lower factor above */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e.
the CSC */ 1136 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1137 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1138 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1139 upTriFactor->csrMat->values->data().get(), 1140 upTriFactor->csrMat->row_offsets->data().get(), 1141 upTriFactor->csrMat->column_indices->data().get(), 1142 upTriFactorT->csrMat->values->data().get(), 1143 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1144 CUSPARSE_ACTION_NUMERIC,indexBase, 1145 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1146 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1147 #endif 1148 1149 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1150 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1151 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1152 upTriFactor->csrMat->values->data().get(), 1153 upTriFactor->csrMat->row_offsets->data().get(), 1154 upTriFactor->csrMat->column_indices->data().get(), 1155 upTriFactorT->csrMat->values->data().get(), 1156 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1157 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1158 CUSPARSE_ACTION_NUMERIC, indexBase, 1159 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1160 #else 1161 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1162 CUSPARSE_ACTION_NUMERIC, indexBase 1163 #endif 1164 );CHKERRCUSPARSE(stat); 1165 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1166 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1167 1168 /* Create the solve analysis information */ 1169 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1170 stat = 
cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1171 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1172 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1173 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1174 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1175 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1176 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1177 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1178 #endif 1179 1180 /* perform the solve analysis */ 1181 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1182 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1183 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1184 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1185 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1186 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1187 #endif 1188 );CHKERRCUSPARSE(stat); 1189 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1190 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1191 1192 /* assign the pointer */ 1193 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1194 PetscFunctionReturn(0); 1195 } 1196 1197 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1198 { 1199 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1200 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1201 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1202 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1203 cusparseStatus_t stat; 1204 cusparseIndexBase_t indexBase; 1205 cudaError_t err; 1206 PetscErrorCode ierr; 1207 
  PetscFunctionBegin;
  /* nothing to do if transpose generation is disabled, already cached, or the matrix is empty */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars for CUSPARSE_POINTER_MODE_DEVICE */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* upload the uncompressed row offsets of A (host CSR) for use by csr2csc */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
#endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                            );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT uses rmap/cmap in the same order as temp — presumably
       correct only for the square case; confirm against the csr2csc call below */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/ 1357 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1358 { 1359 PetscInt n = xx->map->n; 1360 const PetscScalar *barray; 1361 PetscScalar *xarray; 1362 thrust::device_ptr<const PetscScalar> bGPU; 1363 thrust::device_ptr<PetscScalar> xGPU; 1364 cusparseStatus_t stat; 1365 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1366 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1367 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1368 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1369 PetscErrorCode ierr; 1370 cudaError_t cerr; 1371 1372 PetscFunctionBegin; 1373 /* Analyze the matrix and create the transpose ... on the fly */ 1374 if (!loTriFactorT && !upTriFactorT) { 1375 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1376 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1377 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1378 } 1379 1380 /* Get the GPU pointers */ 1381 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1382 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1383 xGPU = thrust::device_pointer_cast(xarray); 1384 bGPU = thrust::device_pointer_cast(barray); 1385 1386 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1387 /* First, reorder with the row permutation */ 1388 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1389 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1390 xGPU); 1391 1392 /* First, solve U */ 1393 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1394 upTriFactorT->csrMat->num_rows, 1395 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1396 
upTriFactorT->csrMat->num_entries, 1397 #endif 1398 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1399 upTriFactorT->csrMat->values->data().get(), 1400 upTriFactorT->csrMat->row_offsets->data().get(), 1401 upTriFactorT->csrMat->column_indices->data().get(), 1402 upTriFactorT->solveInfo, 1403 xarray, tempGPU->data().get() 1404 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1405 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1406 #endif 1407 );CHKERRCUSPARSE(stat); 1408 1409 /* Then, solve L */ 1410 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1411 loTriFactorT->csrMat->num_rows, 1412 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1413 loTriFactorT->csrMat->num_entries, 1414 #endif 1415 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1416 loTriFactorT->csrMat->values->data().get(), 1417 loTriFactorT->csrMat->row_offsets->data().get(), 1418 loTriFactorT->csrMat->column_indices->data().get(), 1419 loTriFactorT->solveInfo, 1420 tempGPU->data().get(), xarray 1421 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1422 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1423 #endif 1424 );CHKERRCUSPARSE(stat); 1425 1426 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1427 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1428 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1429 tempGPU->begin()); 1430 1431 /* Copy the temporary to the full solution. 
*/ 1432 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1433 1434 /* restore */ 1435 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1436 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1437 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1438 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1439 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1440 PetscFunctionReturn(0); 1441 } 1442 1443 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1444 { 1445 const PetscScalar *barray; 1446 PetscScalar *xarray; 1447 cusparseStatus_t stat; 1448 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1449 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1450 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1451 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1452 PetscErrorCode ierr; 1453 cudaError_t cerr; 1454 1455 PetscFunctionBegin; 1456 /* Analyze the matrix and create the transpose ... 
on the fly */ 1457 if (!loTriFactorT && !upTriFactorT) { 1458 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1459 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1460 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1461 } 1462 1463 /* Get the GPU pointers */ 1464 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1465 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1466 1467 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1468 /* First, solve U */ 1469 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1470 upTriFactorT->csrMat->num_rows, 1471 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1472 upTriFactorT->csrMat->num_entries, 1473 #endif 1474 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1475 upTriFactorT->csrMat->values->data().get(), 1476 upTriFactorT->csrMat->row_offsets->data().get(), 1477 upTriFactorT->csrMat->column_indices->data().get(), 1478 upTriFactorT->solveInfo, 1479 barray, tempGPU->data().get() 1480 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1481 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1482 #endif 1483 );CHKERRCUSPARSE(stat); 1484 1485 /* Then, solve L */ 1486 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1487 loTriFactorT->csrMat->num_rows, 1488 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1489 loTriFactorT->csrMat->num_entries, 1490 #endif 1491 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1492 loTriFactorT->csrMat->values->data().get(), 1493 loTriFactorT->csrMat->row_offsets->data().get(), 1494 loTriFactorT->csrMat->column_indices->data().get(), 1495 loTriFactorT->solveInfo, 1496 tempGPU->data().get(), xarray 1497 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1498 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1499 #endif 1500 );CHKERRCUSPARSE(stat); 1501 1502 /* restore */ 1503 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1504 ierr = 
VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1505 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1506 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1507 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1508 PetscFunctionReturn(0); 1509 } 1510 1511 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1512 { 1513 const PetscScalar *barray; 1514 PetscScalar *xarray; 1515 thrust::device_ptr<const PetscScalar> bGPU; 1516 thrust::device_ptr<PetscScalar> xGPU; 1517 cusparseStatus_t stat; 1518 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1519 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1520 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1521 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1522 PetscErrorCode ierr; 1523 cudaError_t cerr; 1524 1525 PetscFunctionBegin; 1526 1527 /* Get the GPU pointers */ 1528 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1529 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1530 xGPU = thrust::device_pointer_cast(xarray); 1531 bGPU = thrust::device_pointer_cast(barray); 1532 1533 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1534 /* First, reorder with the row permutation */ 1535 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1536 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1537 tempGPU->begin()); 1538 1539 /* Next, solve L */ 1540 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1541 loTriFactor->csrMat->num_rows, 1542 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1543 loTriFactor->csrMat->num_entries, 1544 #endif 1545 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1546 loTriFactor->csrMat->values->data().get(), 1547 loTriFactor->csrMat->row_offsets->data().get(), 1548 
loTriFactor->csrMat->column_indices->data().get(), 1549 loTriFactor->solveInfo, 1550 tempGPU->data().get(), xarray 1551 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1552 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1553 #endif 1554 );CHKERRCUSPARSE(stat); 1555 1556 /* Then, solve U */ 1557 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1558 upTriFactor->csrMat->num_rows, 1559 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1560 upTriFactor->csrMat->num_entries, 1561 #endif 1562 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1563 upTriFactor->csrMat->values->data().get(), 1564 upTriFactor->csrMat->row_offsets->data().get(), 1565 upTriFactor->csrMat->column_indices->data().get(), 1566 upTriFactor->solveInfo, 1567 xarray, tempGPU->data().get() 1568 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1569 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1570 #endif 1571 );CHKERRCUSPARSE(stat); 1572 1573 /* Last, reorder with the column permutation */ 1574 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1575 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1576 xGPU); 1577 1578 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1579 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1580 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1581 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1582 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1583 PetscFunctionReturn(0); 1584 } 1585 1586 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1587 { 1588 const PetscScalar *barray; 1589 PetscScalar *xarray; 1590 cusparseStatus_t stat; 1591 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1592 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1593 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = 
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1594 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1595 PetscErrorCode ierr; 1596 cudaError_t cerr; 1597 1598 PetscFunctionBegin; 1599 /* Get the GPU pointers */ 1600 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1601 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1602 1603 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1604 /* First, solve L */ 1605 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1606 loTriFactor->csrMat->num_rows, 1607 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1608 loTriFactor->csrMat->num_entries, 1609 #endif 1610 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1611 loTriFactor->csrMat->values->data().get(), 1612 loTriFactor->csrMat->row_offsets->data().get(), 1613 loTriFactor->csrMat->column_indices->data().get(), 1614 loTriFactor->solveInfo, 1615 barray, tempGPU->data().get() 1616 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1617 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1618 #endif 1619 );CHKERRCUSPARSE(stat); 1620 1621 /* Next, solve U */ 1622 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1623 upTriFactor->csrMat->num_rows, 1624 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1625 upTriFactor->csrMat->num_entries, 1626 #endif 1627 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1628 upTriFactor->csrMat->values->data().get(), 1629 upTriFactor->csrMat->row_offsets->data().get(), 1630 upTriFactor->csrMat->column_indices->data().get(), 1631 upTriFactor->solveInfo, 1632 tempGPU->data().get(), xarray 1633 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1634 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1635 #endif 1636 );CHKERRCUSPARSE(stat); 1637 1638 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1639 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1640 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1641 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1642 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - 
A->cmap->n);CHKERRQ(ierr); 1643 PetscFunctionReturn(0); 1644 } 1645 1646 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1647 { 1648 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1649 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1650 cudaError_t cerr; 1651 PetscErrorCode ierr; 1652 1653 PetscFunctionBegin; 1654 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1655 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1656 1657 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1658 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1659 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1660 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1661 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1662 A->offloadmask = PETSC_OFFLOAD_BOTH; 1663 } 1664 PetscFunctionReturn(0); 1665 } 1666 1667 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1668 { 1669 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1670 PetscErrorCode ierr; 1671 1672 PetscFunctionBegin; 1673 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1674 *array = a->a; 1675 A->offloadmask = PETSC_OFFLOAD_CPU; 1676 PetscFunctionReturn(0); 1677 } 1678 1679 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1680 { 1681 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1682 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1683 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1684 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1685 PetscErrorCode ierr; 1686 cusparseStatus_t stat; 1687 PetscBool both = PETSC_TRUE; 1688 cudaError_t err; 1689 1690 PetscFunctionBegin; 1691 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU"); 1692 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1693 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 
1694 /* Copy values only */ 1695 CsrMatrix *matrix,*matrixT; 1696 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1697 1698 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values"); 1699 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1700 matrix->values->assign(a->a, a->a+a->nz); 1701 err = WaitForCUDA();CHKERRCUDA(err); 1702 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1703 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1704 1705 /* Update matT when it was built before */ 1706 if (cusparsestruct->matTranspose) { 1707 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1708 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1709 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1710 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1711 A->cmap->n, matrix->num_entries, 1712 matrix->values->data().get(), 1713 cusparsestruct->rowoffsets_gpu->data().get(), 1714 matrix->column_indices->data().get(), 1715 matrixT->values->data().get(), 1716 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1717 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1718 CUSPARSE_ACTION_NUMERIC,indexBase, 1719 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1720 #else 1721 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1722 CUSPARSE_ACTION_NUMERIC, indexBase 1723 #endif 1724 );CHKERRCUSPARSE(stat); 1725 err = WaitForCUDA();CHKERRCUDA(err); 1726 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1727 } 1728 } else { 1729 PetscInt nnz; 1730 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1731 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1732 ierr = 
MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1733 delete cusparsestruct->workVector; 1734 delete cusparsestruct->rowoffsets_gpu; 1735 try { 1736 if (a->compressedrow.use) { 1737 m = a->compressedrow.nrows; 1738 ii = a->compressedrow.i; 1739 ridx = a->compressedrow.rindex; 1740 } else { 1741 m = A->rmap->n; 1742 ii = a->i; 1743 ridx = NULL; 1744 } 1745 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data"); 1746 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data"); 1747 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1748 else nnz = a->nz; 1749 1750 /* create cusparse matrix */ 1751 cusparsestruct->nrows = m; 1752 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1753 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1754 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1755 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1756 1757 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1758 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1759 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1760 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1761 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1762 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1763 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1764 1765 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1766 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1767 /* set the matrix */ 1768 
CsrMatrix *mat= new CsrMatrix; 1769 mat->num_rows = m; 1770 mat->num_cols = A->cmap->n; 1771 mat->num_entries = nnz; 1772 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1773 mat->row_offsets->assign(ii, ii + m+1); 1774 1775 mat->column_indices = new THRUSTINTARRAY32(nnz); 1776 mat->column_indices->assign(a->j, a->j+nnz); 1777 1778 mat->values = new THRUSTARRAY(nnz); 1779 if (a->a) mat->values->assign(a->a, a->a+nnz); 1780 1781 /* assign the pointer */ 1782 matstruct->mat = mat; 1783 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1784 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1785 stat = cusparseCreateCsr(&matstruct->matDescr, 1786 mat->num_rows, mat->num_cols, mat->num_entries, 1787 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1788 mat->values->data().get(), 1789 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1790 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1791 } 1792 #endif 1793 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1794 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1795 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1796 #else 1797 CsrMatrix *mat= new CsrMatrix; 1798 mat->num_rows = m; 1799 mat->num_cols = A->cmap->n; 1800 mat->num_entries = nnz; 1801 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1802 mat->row_offsets->assign(ii, ii + m+1); 1803 1804 mat->column_indices = new THRUSTINTARRAY32(nnz); 1805 mat->column_indices->assign(a->j, a->j+nnz); 1806 1807 mat->values = new THRUSTARRAY(nnz); 1808 if (a->a) mat->values->assign(a->a, a->a+nnz); 1809 1810 cusparseHybMat_t hybMat; 1811 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1812 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1813 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1814 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1815 matstruct->descr, mat->values->data().get(), 1816 mat->row_offsets->data().get(), 1817 mat->column_indices->data().get(), 1818 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1819 /* assign the pointer */ 1820 matstruct->mat = hybMat; 1821 1822 if (mat) { 1823 if (mat->values) delete (THRUSTARRAY*)mat->values; 1824 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1825 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1826 delete (CsrMatrix*)mat; 1827 } 1828 #endif 1829 } 1830 1831 /* assign the compressed row indices */ 1832 if (a->compressedrow.use) { 1833 cusparsestruct->workVector = new THRUSTARRAY(m); 1834 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1835 matstruct->cprowIndices->assign(ridx,ridx+m); 1836 tmp = m; 1837 } else { 1838 cusparsestruct->workVector = NULL; 1839 matstruct->cprowIndices = NULL; 1840 tmp = 0; 1841 } 1842 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1843 1844 /* assign the pointer */ 1845 cusparsestruct->mat = matstruct; 1846 } catch(char *ex) { 1847 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1848 } 1849 err = WaitForCUDA();CHKERRCUDA(err); 1850 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1851 cusparsestruct->nonzerostate = A->nonzerostate; 1852 } 1853 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1854 } 1855 PetscFunctionReturn(0); 1856 } 1857 1858 struct VecCUDAPlusEquals 1859 { 1860 template <typename Tuple> 1861 __host__ __device__ 1862 void operator()(Tuple t) 1863 { 1864 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1865 } 1866 }; 1867 1868 struct VecCUDAEquals 1869 { 1870 template <typename Tuple> 1871 __host__ __device__ 1872 void operator()(Tuple t) 1873 { 1874 thrust::get<1>(t) = thrust::get<0>(t); 1875 } 
1876 }; 1877 1878 struct VecCUDAEqualsReverse 1879 { 1880 template <typename Tuple> 1881 __host__ __device__ 1882 void operator()(Tuple t) 1883 { 1884 thrust::get<0>(t) = thrust::get<1>(t); 1885 } 1886 }; 1887 1888 struct MatMatCusparse { 1889 PetscBool cisdense; 1890 PetscScalar *Bt; 1891 Mat X; 1892 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1893 PetscLogDouble flops; 1894 CsrMatrix *Bcsr; 1895 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1896 cusparseSpMatDescr_t matSpBDescr; 1897 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1898 cusparseDnMatDescr_t matBDescr; 1899 cusparseDnMatDescr_t matCDescr; 1900 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1901 size_t mmBufferSize; 1902 void *mmBuffer; 1903 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1904 cusparseSpGEMMDescr_t spgemmDesc; 1905 #endif 1906 }; 1907 1908 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1909 { 1910 PetscErrorCode ierr; 1911 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1912 cudaError_t cerr; 1913 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1914 cusparseStatus_t stat; 1915 #endif 1916 1917 PetscFunctionBegin; 1918 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1919 delete mmdata->Bcsr; 1920 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1921 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1922 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1923 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1924 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1925 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1926 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1927 #endif 1928 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 
1929 ierr = PetscFree(data);CHKERRQ(ierr); 1930 PetscFunctionReturn(0); 1931 } 1932 1933 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1934 1935 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1936 { 1937 Mat_Product *product = C->product; 1938 Mat A,B; 1939 PetscInt m,n,blda,clda; 1940 PetscBool flg,biscuda; 1941 Mat_SeqAIJCUSPARSE *cusp; 1942 cusparseStatus_t stat; 1943 cusparseOperation_t opA; 1944 const PetscScalar *barray; 1945 PetscScalar *carray; 1946 PetscErrorCode ierr; 1947 MatMatCusparse *mmdata; 1948 Mat_SeqAIJCUSPARSEMultStruct *mat; 1949 CsrMatrix *csrmat; 1950 cudaError_t cerr; 1951 1952 PetscFunctionBegin; 1953 MatCheckProduct(C,1); 1954 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1955 mmdata = (MatMatCusparse*)product->data; 1956 A = product->A; 1957 B = product->B; 1958 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1959 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1960 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1961 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1962 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1963 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1964 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1965 switch (product->type) { 1966 case MATPRODUCT_AB: 1967 case MATPRODUCT_PtAP: 1968 mat = cusp->mat; 1969 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1970 m = A->rmap->n; 1971 n = B->cmap->n; 1972 break; 1973 case MATPRODUCT_AtB: 1974 if (!cusp->transgen) { 1975 mat = cusp->mat; 1976 opA = CUSPARSE_OPERATION_TRANSPOSE; 1977 } else { 1978 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 
1979 mat = cusp->matTranspose; 1980 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1981 } 1982 m = A->cmap->n; 1983 n = B->cmap->n; 1984 break; 1985 case MATPRODUCT_ABt: 1986 case MATPRODUCT_RARt: 1987 mat = cusp->mat; 1988 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1989 m = A->rmap->n; 1990 n = B->rmap->n; 1991 break; 1992 default: 1993 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1994 } 1995 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1996 csrmat = (CsrMatrix*)mat->mat; 1997 /* if the user passed a CPU matrix, copy the data to the GPU */ 1998 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1999 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2000 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2001 2002 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2003 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2004 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2005 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2006 } else { 2007 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2008 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2009 } 2010 2011 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2012 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2013 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2014 /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2015 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2016 size_t mmBufferSize; 2017 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2018 if (!mmdata->matBDescr) { 2019 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2020 mmdata->Blda = blda; 2021 } 2022 2023 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2024 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2025 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2026 mmdata->Clda = clda; 2027 } 2028 2029 if (!mat->matDescr) { 2030 stat = cusparseCreateCsr(&mat->matDescr, 2031 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2032 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2033 csrmat->values->data().get(), 2034 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2035 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2036 } 2037 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2038 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2039 mmdata->matCDescr,cusparse_scalartype, 2040 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2041 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2042 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2043 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2044 mmdata->mmBufferSize = mmBufferSize; 2045 } 2046 mmdata->initialized = PETSC_TRUE; 2047 } else { 2048 /* 
to be safe, always update pointers of the mats */ 2049 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2050 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2051 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2052 } 2053 2054 /* do cusparseSpMM, which supports transpose on B */ 2055 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2056 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2057 mmdata->matCDescr,cusparse_scalartype, 2058 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2059 #else 2060 PetscInt k; 2061 /* cusparseXcsrmm does not support transpose on B */ 2062 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2063 cublasHandle_t cublasv2handle; 2064 cublasStatus_t cerr; 2065 2066 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2067 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2068 B->cmap->n,B->rmap->n, 2069 &PETSC_CUSPARSE_ONE ,barray,blda, 2070 &PETSC_CUSPARSE_ZERO,barray,blda, 2071 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2072 blda = B->cmap->n; 2073 k = B->cmap->n; 2074 } else { 2075 k = B->rmap->n; 2076 } 2077 2078 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2079 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2080 csrmat->num_entries,mat->alpha_one,mat->descr, 2081 csrmat->values->data().get(), 2082 csrmat->row_offsets->data().get(), 2083 csrmat->column_indices->data().get(), 2084 mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    /* the sparse kernel above produced the intermediate X; finish C on the GPU with a dense-dense product */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    /* same two-step scheme for PtAP; note the different boolean flags passed to the dense kernel */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) { /* C entered the symbolic phase as MATSEQDENSE: convert back so the caller sees the type it started with */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) { /* B was converted to CUDA on entry; restore its original host type */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for products of a SeqAIJCUSPARSE matrix with a (CUDA) dense matrix
   (AB, AtB, ABt, PtAP, RARt): sets the sizes/type of C and allocates the MatMatCusparse
   product data (and intermediate storage) that the numeric phase consumes. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* determine the (square for PtAP/RARt) dimensions of C from the requested product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense; /* remembered so the numeric phase can convert C back to MATSEQDENSE */
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase for sparse*sparse products C = op(A)*op(B): runs cusparseSpGEMM_compute/copy
   (CUDA >= 11) or csrgemm (older CUDA) on the pattern built by the symbolic phase. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize; /* values already on the GPU from the symbolic phase: only the assembly bookkeeping remains */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  ptype = product->type;
  /* for symmetric operands the transpose product equals the plain product, so reuse the non-transposed structs */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* re-run the numeric SpGEMM using the descriptor/buffers prepared by the symbolic phase */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse*sparse products: computes the nonzero pattern of
   C = op(A)*op(B) on the GPU (cusparseSpGEMM on CUDA >= 11, csrgemm otherwise)
   and mirrors the resulting CSR structure to the host SeqAIJ data. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy =
MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* for symmetric operands the transpose product equals the plain product, avoiding an explicit transpose */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows  = Ccusp->nrows;
  Ccsr->num_cols  = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants, required by the CUSPARSE_POINTER_MODE_DEVICE calls below */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* shallow CSR wrapper that shares column indices/values with B but uses full (uncompressed) row offsets */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C starts with zero nnz; cusparseSpMatGetSize below reports the real count after compute */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU CSR pattern into the host SeqAIJ structure */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* a CPU-bound operand disables the CUSPARSE kernels */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2729 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2730 { 2731 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2732 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2733 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2734 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2735 PetscErrorCode ierr; 2736 cudaError_t cerr; 2737 cusparseStatus_t stat; 2738 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2739 PetscBool compressed; 2740 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2741 PetscInt nx,ny; 2742 #endif 2743 2744 PetscFunctionBegin; 2745 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2746 if (!a->nonzerorowcnt) { 2747 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2748 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2749 PetscFunctionReturn(0); 2750 } 2751 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2752 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2753 if (!trans) { 2754 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2755 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2756 } else { 2757 if (herm || !cusparsestruct->transgen) { 2758 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2759 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2760 } else { 2761 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2762 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2763 } 2764 } 2765 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2766 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 2767 2768 try { 2769 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2770 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2771 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2772 2773 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2774 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2775 /* z = A x + beta y. 2776 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2777 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2778 */ 2779 xptr = xarray; 2780 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2781 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2782 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2783 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2784 allocated to accommodate different uses. So we get the length info directly from mat. 2785 */ 2786 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2787 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2788 nx = mat->num_cols; 2789 ny = mat->num_rows; 2790 } 2791 #endif 2792 } else { 2793 /* z = A^T x + beta y 2794 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2795 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2796 */ 2797 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2798 dptr = zarray; 2799 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 2800 if (compressed) { /* Scatter x to work vector */ 2801 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2802 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2803 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2804 VecCUDAEqualsReverse()); 2805 } 2806 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2807 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2808 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2809 nx = mat->num_rows; 2810 ny = mat->num_cols; 2811 } 2812 #endif 2813 } 2814 2815 /* csr_spmv does y = alpha op(A) x + beta y */ 2816 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2817 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2818 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2819 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2820 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2821 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2822 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2823 matstruct->matDescr, 2824 matstruct->cuSpMV[opA].vecXDescr, beta, 2825 matstruct->cuSpMV[opA].vecYDescr, 2826 cusparse_scalartype, 2827 cusparsestruct->spmvAlg, 2828 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2829 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2830 2831 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2832 } else { 2833 /* x, y's value pointers might change between 
calls, but their shape is kept, so we just update pointers */ 2834 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2835 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2836 } 2837 2838 stat = cusparseSpMV(cusparsestruct->handle, opA, 2839 matstruct->alpha_one, 2840 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2841 matstruct->cuSpMV[opA].vecXDescr, 2842 beta, 2843 matstruct->cuSpMV[opA].vecYDescr, 2844 cusparse_scalartype, 2845 cusparsestruct->spmvAlg, 2846 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2847 #else 2848 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2849 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2850 mat->num_rows, mat->num_cols, 2851 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2852 mat->values->data().get(), mat->row_offsets->data().get(), 2853 mat->column_indices->data().get(), xptr, beta, 2854 dptr);CHKERRCUSPARSE(stat); 2855 #endif 2856 } else { 2857 if (cusparsestruct->nrows) { 2858 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2859 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2860 #else 2861 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2862 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2863 matstruct->alpha_one, matstruct->descr, hybMat, 2864 xptr, beta, 2865 dptr);CHKERRCUSPARSE(stat); 2866 #endif 2867 } 2868 } 2869 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2870 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2871 2872 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2873 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2874 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2875 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2876 } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 2877 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2878 } 2879 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2880 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2881 } 2882 2883 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2884 if (compressed) { 2885 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2886 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2887 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2888 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2889 VecCUDAPlusEquals()); 2890 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2891 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2892 } 2893 } else { 2894 if (yy && yy != zz) { 2895 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2896 } 2897 } 2898 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2899 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2900 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2901 } catch(char *ex) { 2902 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2903 } 2904 if (yy) { 2905 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2906 } else { 2907 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2908 } 2909 PetscFunctionReturn(0); 2910 } 2911 2912 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2913 { 2914 PetscErrorCode ierr; 2915 2916 PetscFunctionBegin; 2917 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2918 
PetscFunctionReturn(0); 2919 } 2920 2921 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2922 { 2923 PetscErrorCode ierr; 2924 PetscSplitCSRDataStructure *d_mat = NULL; 2925 PetscFunctionBegin; 2926 if (A->factortype == MAT_FACTOR_NONE) { 2927 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2928 } 2929 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2930 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2931 if (d_mat) { 2932 A->offloadmask = PETSC_OFFLOAD_GPU; 2933 } 2934 2935 PetscFunctionReturn(0); 2936 } 2937 2938 /* --------------------------------------------------------------------------------*/ 2939 /*@ 2940 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2941 (the default parallel PETSc format). This matrix will ultimately pushed down 2942 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2943 assembly performance the user should preallocate the matrix storage by setting 2944 the parameter nz (or the array nnz). By setting these parameters accurately, 2945 performance during matrix assembly can be increased by more than a factor of 50. 2946 2947 Collective 2948 2949 Input Parameters: 2950 + comm - MPI communicator, set to PETSC_COMM_SELF 2951 . m - number of rows 2952 . n - number of columns 2953 . nz - number of nonzeros per row (same for all rows) 2954 - nnz - array containing the number of nonzeros in the various rows 2955 (possibly different for each row) or NULL 2956 2957 Output Parameter: 2958 . A - the matrix 2959 2960 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2961 MatXXXXSetPreallocation() paradgm instead of this routine directly. 
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Release all GPU-side storage (mult structs or triangular factors, plus any
   device-resident split-CSR matrix), then fall through to the host AIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device matrix pointer before the struct holding it is freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    /* copy the device struct back so the device arrays it points to can be freed */
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate on the host, then convert the copy in place to AIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
{
  /* Y = Y + a*X. When both matrices are AIJCUSPARSE with the same (detected) nonzero
     pattern and both have their CSR values on the GPU, the addition is done with a
     single cublas axpy on the device; otherwise it falls back to the host AIJ path. */
  PetscErrorCode ierr;
  Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  PetscBool      flgx,flgy;

  PetscFunctionBegin;
  if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
  PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
  PetscValidHeaderSpecific(X,MAT_CLASSID,3);
  ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr);
  if (!flgx || !flgy) {
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE");
  if (str == DIFFERENT_NONZERO_PATTERN) {
    /* the patterns may actually match; compare i/j arrays so the fast path can be taken */
    if (x->nz == y->nz) {
      PetscBool e;
      ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
      if (e) {
        ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
        if (e) {
          str = SAME_NONZERO_PATTERN;
        }
      }
    }
  }
  if (str != SAME_NONZERO_PATTERN) {
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  } else {
    Mat_SeqAIJCUSPARSE *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr;
    if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) {
      /* GPU copies not built yet: compute on the host and push both back up */
      if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
      }
      if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr);
      }
      ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
    } else {
      /* identical sparsity: the values arrays line up entry-for-entry, so a flat axpy suffices */
      cublasHandle_t cublasv2handle;
      cublasStatus_t cberr;
      cudaError_t    err;
      PetscScalar    alpha = a;
      PetscBLASInt   one = 1, bnz = 1;
      CsrMatrix      *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat;
      CsrMatrix      *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat;
      PetscScalar    *aa_y, *aa_x;
      aa_y = matrix_y->values->data().get();
      aa_x = matrix_x->values->data().get();
      ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
      ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
      ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr);
      if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU;
      else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state");
      ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Zero the matrix values on both device (CSR and its transpose, if built) and host */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  /* both host and device copies were zeroed only if the device CSR existed */
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}

/* Swap the operation table between the host (SeqAIJ) and device (SeqAIJCUSPARSE)
   implementations; flg == PETSC_TRUE binds the matrix to the CPU. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is current before handing control back to the CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: allocate the GPU side-structure,
   switch the default vector type to CUDA, and install the device op table. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via
MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created for this matrix should live on the GPU as well */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrices carry triangular-factor storage instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the remaining device ops and composed functions */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: build a SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


/* Register the CUSPARSE solver type for all factorizations it supports */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Free the whole Mat_SeqAIJCUSPARSE structure: mult structs, work/COO buffers, and the handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is a no-op, so no guard is needed here */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
#endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
3312 } 3313 3314 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3315 { 3316 PetscFunctionBegin; 3317 if (*mat) { 3318 delete (*mat)->values; 3319 delete (*mat)->column_indices; 3320 delete (*mat)->row_offsets; 3321 delete *mat; 3322 *mat = 0; 3323 } 3324 PetscFunctionReturn(0); 3325 } 3326 3327 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3328 { 3329 cusparseStatus_t stat; 3330 PetscErrorCode ierr; 3331 3332 PetscFunctionBegin; 3333 if (*trifactor) { 3334 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3335 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3336 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3337 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3338 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3339 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3340 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3341 #endif 3342 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3343 } 3344 PetscFunctionReturn(0); 3345 } 3346 3347 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3348 { 3349 CsrMatrix *mat; 3350 cusparseStatus_t stat; 3351 cudaError_t err; 3352 3353 PetscFunctionBegin; 3354 if (*matstruct) { 3355 if ((*matstruct)->mat) { 3356 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3357 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3358 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3359 #else 3360 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3361 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3362 #endif 3363 } else { 
3364 mat = (CsrMatrix*)(*matstruct)->mat; 3365 CsrMatrix_Destroy(&mat); 3366 } 3367 } 3368 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3369 delete (*matstruct)->cprowIndices; 3370 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3371 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3372 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3373 3374 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3375 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3376 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3377 for (int i=0; i<3; i++) { 3378 if (mdata->cuSpMV[i].initialized) { 3379 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3380 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3381 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3382 } 3383 } 3384 #endif 3385 delete *matstruct; 3386 *matstruct = NULL; 3387 } 3388 PetscFunctionReturn(0); 3389 } 3390 3391 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3392 { 3393 PetscErrorCode ierr; 3394 3395 PetscFunctionBegin; 3396 if (*trifactors) { 3397 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3398 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3399 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3400 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3401 delete (*trifactors)->rpermIndices; 3402 delete (*trifactors)->cpermIndices; 3403 delete (*trifactors)->workVector; 3404 (*trifactors)->rpermIndices = NULL; 3405 (*trifactors)->cpermIndices = NULL; 3406 (*trifactors)->workVector = NULL; 3407 } 3408 
PetscFunctionReturn(0); 3409 } 3410 3411 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3412 { 3413 PetscErrorCode ierr; 3414 cusparseHandle_t handle; 3415 cusparseStatus_t stat; 3416 3417 PetscFunctionBegin; 3418 if (*trifactors) { 3419 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3420 if (handle = (*trifactors)->handle) { 3421 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3422 } 3423 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3424 } 3425 PetscFunctionReturn(0); 3426 } 3427 3428 struct IJCompare 3429 { 3430 __host__ __device__ 3431 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3432 { 3433 if (t1.get<0>() < t2.get<0>()) return true; 3434 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3435 return false; 3436 } 3437 }; 3438 3439 struct IJEqual 3440 { 3441 __host__ __device__ 3442 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3443 { 3444 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3445 return true; 3446 } 3447 }; 3448 3449 struct IJDiff 3450 { 3451 __host__ __device__ 3452 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3453 { 3454 return t1 == t2 ? 
0 : 1;
  }
};

/* Logical OR collapsed to {0,1}; merges the row-change and col-change flags */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Set the numerical values of a matrix preallocated with
   MatSetPreallocationCOO_SeqAIJCUSPARSE(). `v` is ordered as the original COO
   input; cusp->cooPerm maps that ordering into CSR order, and cusp->cooPerm_a
   (when present) carries reduction keys for duplicate (i,j) entries. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY        *cooPerm_v = NULL;                  /* device copy of v when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info: fall back to a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values + INSERT_VALUES means zero the matrix */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);                /* already on the device */
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: reduce into a scratch array, then add to the CSR values */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      /* accumulate the reduced values into the existing CSR values */
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* no duplicates: add each permuted input value onto its CSR slot */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* no duplicates: straight permuted copy into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;                    /* values now live on the GPU only */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is
0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Preallocate the matrix from COO triplets (coo_i, coo_j): sorts the entries
   into CSR order on the GPU, detects duplicates, builds the host CSR index
   arrays, and records the permutation (cusp->cooPerm) that later maps user
   value arrays into CSR order in MatSetValuesCOO_SeqAIJCUSPARSE(). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;                 /* nzr counts nonzero rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* entry count changed: discard any previously cached permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);                               /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort (i,j) keys lexicographically, dragging the identity permutation along */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;                              /* keep sorted rows for duplicate detection */
    THRUSTINTARRAY w = d_j;                              /* and sorted cols */

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag where the sorted row (resp. col) index changes, OR the flags,
         then inclusive-scan to turn them into reduction keys: equal keys
         identify duplicate (i,j) entries to be summed later */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;                         /* adjacent_difference leaves slot 0 untouched */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* per-row counts via binary search over the (unique) sorted row indices */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR index arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;                                         /* ii holds cumulative counts for rows 1..n */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);                         /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr =
PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty matrix: standard empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* any cached transpose refers to the old nonzero pattern */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Return a read-only device pointer to the CSR values; ensures the GPU copy
   is up to date first. Pair with MatSeqAIJCUSPARSERestoreArrayRead(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/* Counterpart of GetArrayRead: read access leaves no state to update */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
*a = NULL;
  PetscFunctionReturn(0);
}

/* Return a writable device pointer to the CSR values. Note the current GPU
   contents are NOT refreshed from the CPU first (write-only semantics).
   Pair with MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/* Counterpart of GetArrayWrite: bump the object state and mark the GPU copy
   as the authoritative one */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Orders merged (row, col, value, origin-flag) tuples by (row, col) */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Adds a fixed offset to an index (used to shift B's column indices by A's
   column count when concatenating) */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices side by side, [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode
ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C = [A B] from scratch: allocate the GPU CSR structure, merge the
       entries of A and B (B's columns shifted by A->cmap->n), and record in
       Ccusp->cooPerm where each A/B entry landed so the MAT_REUSE_MATRIX path
       can refresh values without redoing the merge */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;               /* C is stored with full rows */
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-side scalar constants consumed by cusparse SpMV calls */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix*)Acusp->mat->mat;
    Bcsr  = (CsrMatrix*)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* expand both CSRs to COO so the row index is explicit during the merge */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);    /* origin flag: 1 = from A, 0 = from B */
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));  /* shift B's columns in place; undone below */
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();                 /* first Annz slots record A's destinations */
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);                          /* remaining slots record B's destinations */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));  /* restore B's original columns */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split merged positions by origin flag: A positions to p1, B positions to p2 */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] stacked vertically: row offsets/columns/values are
           simple concatenations (B's offsets shifted by a->nz) */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);                        /* overwrite A's trailing offset with B's shifted first one */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR indices to the host so the SeqAIJ side of C is usable */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: same sparsity pattern, refresh only the values using
       the destination permutation recorded during the initial merge */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* consistency checks before touching device memory */
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);           /* cooPerm[0..Annz) -> A's slots, [Annz..) -> B's */
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        /* transpose values are a plain concatenation A'|B': copy straight through */
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}