/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
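/* Illustration only (not part of the original source): PetscOptionsEnum() matches an option value
   by its 0-based position in the arrays above, so a hypothetical run such as

     ./app -mat_type seqaijcusparse -mat_cusparse_spmv_alg csrmv_alg1 -mat_cusparse_csr2csc_alg alg1

   would select CUSPARSE_CSRMV_ALG1 (position 2 in MatCUSPARSESpMVAlgorithms[], which must agree with
   the cuSPARSE value 2) and CUSPARSE_CSR2CSC_ALG1 (position 1). The "INVALID" placeholder exists only
   to keep ALG1/ALG2 aligned with the 1-based cuSPARSE csr2csc enum. */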
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
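/* Minimal usage sketch (illustrative, assumptions noted; not taken from this file): an application
   that already owns a CUDA stream and cuSPARSE handle could hand them to a MATSEQAIJCUSPARSE matrix
   roughly as follows, with error checking omitted for brevity:

     cudaStream_t     stream;
     cusparseHandle_t handle;
     cudaStreamCreate(&stream);
     cusparseCreate(&handle);
     cusparseSetStream(handle,stream);
     ierr = MatCUSPARSESetHandle(A,handle);CHKERRQ(ierr);
     ierr = MatCUSPARSESetStream(A,stream);CHKERRQ(ierr);

   Note that MatCUSPARSESetHandle() destroys any handle the matrix previously owned and switches the
   pointer mode of the new handle to CUSPARSE_POINTER_MODE_DEVICE. */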
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. The currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm; however, the performance can be quite poor, so these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
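/* Usage sketch (illustrative, not part of the original file): the factor above is normally reached
   either from the command line, e.g.

     -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically, roughly as

     Mat           F;
     MatFactorInfo info;
     ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
     ierr = MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);CHKERRQ(ierr);
     ierr = MatILUFactorSymbolic(F,A,isrow,iscol,&info);CHKERRQ(ierr);  // isrow/iscol: caller-provided orderings
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);
     ierr = MatSolve(F,b,x);CHKERRQ(ierr);

   The numeric factorization itself runs on the CPU (see MatLUFactorNumeric_SeqAIJCUSPARSE below);
   only the triangular solves are offloaded to the GPU. */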
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present, to prevent logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
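/* Sketch of how the two setters above might be used (illustrative, not part of the original file):

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);CHKERRQ(ierr);  // ELL for SpMV only (CUDA < 11)
     ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);         // build A^T once, reuse in MatMultTranspose

   Both calls are harmless on matrices that are not of type MATSEQAIJCUSPARSE: the format setter goes
   through PetscTryMethod(), and the transpose setter checks the type explicitly. */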
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
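/* Sketch (assumptions noted, illustrative only): the options above are queried only for non-factored
   matrices and are read when MatSetFromOptions() is called on the matrix, e.g.

     ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,n,n,n,n);CHKERRQ(ierr);
     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetFromOptions(A);CHKERRQ(ierr);   // picks up the -mat_cusparse_* options

   run with, say, "-mat_cusparse_storage_format ell -mat_cusparse_transgen 0" (option values are
   matched case-insensitively by PetscOptionsEnum()/PetscOptionsBool()). */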
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
*/ 366 nzLower=n+ai[n]-ai[1]; 367 if (!loTriFactor) { 368 PetscScalar *AALo; 369 370 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 371 372 /* Allocate Space for the lower triangular matrix */ 373 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 374 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 375 376 /* Fill the lower triangular matrix */ 377 AiLo[0] = (PetscInt) 0; 378 AiLo[n] = nzLower; 379 AjLo[0] = (PetscInt) 0; 380 AALo[0] = (MatScalar) 1.0; 381 v = aa; 382 vi = aj; 383 offset = 1; 384 rowOffset= 1; 385 for (i=1; i<n; i++) { 386 nz = ai[i+1] - ai[i]; 387 /* additional 1 for the term on the diagonal */ 388 AiLo[i] = rowOffset; 389 rowOffset += nz+1; 390 391 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 392 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 393 394 offset += nz; 395 AjLo[offset] = (PetscInt) i; 396 AALo[offset] = (MatScalar) 1.0; 397 offset += 1; 398 399 v += nz; 400 vi += nz; 401 } 402 403 /* allocate space for the triangular factor information */ 404 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 405 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 406 /* Create the matrix description */ 407 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 408 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 409 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 410 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 411 #else 412 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 413 #endif 414 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 415 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 416 417 /* set the operation */ 418 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 419 420 /* set the matrix */ 421 loTriFactor->csrMat = new CsrMatrix; 422 loTriFactor->csrMat->num_rows = n; 423 loTriFactor->csrMat->num_cols = n; 424 loTriFactor->csrMat->num_entries = nzLower; 425 426 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 427 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 428 429 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 430 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 431 432 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 433 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 434 435 /* Create the solve analysis information */ 436 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 437 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 438 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 439 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 440 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 441 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 442 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 443 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 444 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 445 #endif 446 447 /* perform the solve analysis */ 448 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 449 loTriFactor->csrMat->num_rows, 
loTriFactor->csrMat->num_entries, loTriFactor->descr, 450 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 451 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 452 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 453 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 454 #endif 455 );CHKERRCUSPARSE(stat); 456 cerr = WaitForCUDA();CHKERRCUDA(cerr); 457 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 458 459 /* assign the pointer */ 460 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 461 loTriFactor->AA_h = AALo; 462 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 463 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 464 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 465 } else { /* update values only */ 466 if (!loTriFactor->AA_h) { 467 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 468 } 469 /* Fill the lower triangular matrix */ 470 loTriFactor->AA_h[0] = 1.0; 471 v = aa; 472 vi = aj; 473 offset = 1; 474 for (i=1; i<n; i++) { 475 nz = ai[i+1] - ai[i]; 476 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 477 offset += nz; 478 loTriFactor->AA_h[offset] = 1.0; 479 offset += 1; 480 v += nz; 481 } 482 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 483 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 484 } 485 } catch(char *ex) { 486 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 487 } 488 } 489 PetscFunctionReturn(0); 490 } 491 492 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 493 { 494 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 495 PetscInt n = A->rmap->n; 496 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 497 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 498 cusparseStatus_t stat; 499 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 500 const MatScalar *aa = a->a,*v; 501 PetscInt *AiUp, *AjUp; 502 PetscInt i,nz, nzUpper, offset; 503 PetscErrorCode ierr; 504 cudaError_t cerr; 505 506 PetscFunctionBegin; 507 if (!n) PetscFunctionReturn(0); 508 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 509 try { 510 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 511 nzUpper = adiag[0]-adiag[n]; 512 if (!upTriFactor) { 513 PetscScalar *AAUp; 514 515 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 516 517 /* Allocate Space for the upper triangular matrix */ 518 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 519 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 520 521 /* Fill the upper triangular matrix */ 522 AiUp[0]=(PetscInt) 0; 523 AiUp[n]=nzUpper; 524 offset = nzUpper; 525 for (i=n-1; i>=0; i--) { 526 v = aa + adiag[i+1] + 1; 527 vi = aj + adiag[i+1] + 1; 528 529 /* number of elements NOT on the diagonal */ 530 nz = adiag[i] - adiag[i+1]-1; 531 532 /* decrement the offset */ 533 offset -= (nz+1); 534 535 /* first, set the diagonal elements */ 536 AjUp[offset] = (PetscInt) i; 537 AAUp[offset] = (MatScalar)1./v[nz]; 538 AiUp[i] = AiUp[i+1] - (nz+1); 539 540 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 541 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 542 } 543 544 /* allocate space for the triangular factor information */ 545 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 546 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 547 548 /* Create the matrix description */ 549 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 550 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 551 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 552 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 553 #else 554 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 555 #endif 556 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 557 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 558 559 /* set the operation */ 560 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 561 562 /* set the matrix */ 563 upTriFactor->csrMat = new CsrMatrix; 564 upTriFactor->csrMat->num_rows = n; 565 upTriFactor->csrMat->num_cols = n; 566 upTriFactor->csrMat->num_entries = nzUpper; 567 568 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 569 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 570 571 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 572 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 573 574 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 575 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 576 577 /* Create the solve analysis information */ 578 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 579 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 580 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 581 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 582 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 583 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 584 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 585 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 586 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 587 #endif 588 589 /* perform the solve analysis */ 590 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 591 upTriFactor->csrMat->num_rows, 
upTriFactor->csrMat->num_entries, upTriFactor->descr, 592 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 593 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 594 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 595 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 596 #endif 597 );CHKERRCUSPARSE(stat); 598 cerr = WaitForCUDA();CHKERRCUDA(cerr); 599 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 600 601 /* assign the pointer */ 602 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 603 upTriFactor->AA_h = AAUp; 604 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 605 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 606 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 607 } else { 608 if (!upTriFactor->AA_h) { 609 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 610 } 611 /* Fill the upper triangular matrix */ 612 offset = nzUpper; 613 for (i=n-1; i>=0; i--) { 614 v = aa + adiag[i+1] + 1; 615 616 /* number of elements NOT on the diagonal */ 617 nz = adiag[i] - adiag[i+1]-1; 618 619 /* decrement the offset */ 620 offset -= (nz+1); 621 622 /* first, set the diagonal elements */ 623 upTriFactor->AA_h[offset] = 1./v[nz]; 624 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 625 } 626 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 627 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 628 } 629 } catch(char *ex) { 630 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 631 } 632 } 633 PetscFunctionReturn(0); 634 } 635 636 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 637 { 638 PetscErrorCode ierr; 639 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 640 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 641 IS isrow = a->row,iscol = a->icol; 642 PetscBool row_identity,col_identity; 643 PetscInt n = A->rmap->n; 644 645 PetscFunctionBegin; 646 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 647 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 648 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 649 650 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 651 cusparseTriFactors->nnz=a->nz; 652 653 A->offloadmask = PETSC_OFFLOAD_BOTH; 654 /* lower triangular indices */ 655 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 656 if (!row_identity && !cusparseTriFactors->rpermIndices) { 657 const PetscInt *r; 658 659 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 660 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 661 cusparseTriFactors->rpermIndices->assign(r, r+n); 662 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 663 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 664 } 665 666 /* upper triangular indices */ 667 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 668 if (!col_identity && !cusparseTriFactors->cpermIndices) { 669 const PetscInt *c; 670 671 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 672 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 673 cusparseTriFactors->cpermIndices->assign(c, c+n); 674 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 675 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 676 } 677 PetscFunctionReturn(0); 678 } 679 680 static PetscErrorCode 
MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 681 { 682 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 683 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 684 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 685 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 686 cusparseStatus_t stat; 687 PetscErrorCode ierr; 688 cudaError_t cerr; 689 PetscInt *AiUp, *AjUp; 690 PetscScalar *AAUp; 691 PetscScalar *AALo; 692 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 693 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 694 const PetscInt *ai = b->i,*aj = b->j,*vj; 695 const MatScalar *aa = b->a,*v; 696 697 PetscFunctionBegin; 698 if (!n) PetscFunctionReturn(0); 699 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 700 try { 701 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 702 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 703 if (!upTriFactor && !loTriFactor) { 704 /* Allocate Space for the upper triangular matrix */ 705 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 706 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 707 708 /* Fill the upper triangular matrix */ 709 AiUp[0]=(PetscInt) 0; 710 AiUp[n]=nzUpper; 711 offset = 0; 712 for (i=0; i<n; i++) { 713 /* set the pointers */ 714 v = aa + ai[i]; 715 vj = aj + ai[i]; 716 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 717 718 /* first, set the diagonal elements */ 719 AjUp[offset] = (PetscInt) i; 720 AAUp[offset] = (MatScalar)1.0/v[nz]; 721 AiUp[i] = offset; 722 AALo[offset] = (MatScalar)1.0/v[nz]; 723 724 offset+=1; 725 if (nz>0) { 726 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 727 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 728 for (j=offset; j<offset+nz; j++) { 729 AAUp[j] = -AAUp[j]; 730 AALo[j] = AAUp[j]/v[nz]; 731 } 732 offset+=nz; 733 } 734 } 735 736 /* allocate space for the triangular factor information */ 737 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 738 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 739 740 /* Create the matrix description */ 741 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 742 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 743 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 744 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 745 #else 746 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 747 #endif 748 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 749 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 750 751 /* set the matrix */ 752 upTriFactor->csrMat = new CsrMatrix; 753 upTriFactor->csrMat->num_rows = A->rmap->n; 754 upTriFactor->csrMat->num_cols = A->cmap->n; 755 upTriFactor->csrMat->num_entries = a->nz; 756 757 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 758 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 759 760 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 761 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 762 763 upTriFactor->csrMat->values = new 
THRUSTARRAY(a->nz); 764 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 765 766 /* set the operation */ 767 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 768 769 /* Create the solve analysis information */ 770 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 771 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 772 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 773 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 774 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 775 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 776 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 777 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 778 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 779 #endif 780 781 /* perform the solve analysis */ 782 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 783 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 784 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 785 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 786 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 787 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 788 #endif 789 );CHKERRCUSPARSE(stat); 790 cerr = WaitForCUDA();CHKERRCUDA(cerr); 791 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 792 793 /* assign the pointer */ 794 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 795 796 /* allocate space for the triangular factor information */ 797 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 798 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 799 800 /* Create the matrix description */ 801 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 802 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 803 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 804 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 805 #else 806 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 807 #endif 808 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 809 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 810 811 /* set the operation */ 812 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 813 814 /* set the matrix */ 815 loTriFactor->csrMat = new CsrMatrix; 816 loTriFactor->csrMat->num_rows = A->rmap->n; 817 loTriFactor->csrMat->num_cols = A->cmap->n; 818 loTriFactor->csrMat->num_entries = a->nz; 819 820 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 821 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 822 823 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 824 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 825 826 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 827 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 828 829 /* Create the solve analysis information */ 830 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 831 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 832 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 833 
stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 834 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 835 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 836 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 837 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 838 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 839 #endif 840 841 /* perform the solve analysis */ 842 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 843 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 844 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 845 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 846 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 847 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 848 #endif 849 );CHKERRCUSPARSE(stat); 850 cerr = WaitForCUDA();CHKERRCUDA(cerr); 851 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 852 853 /* assign the pointer */ 854 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 855 856 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 857 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 858 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 859 } else { 860 /* Fill the upper triangular matrix */ 861 offset = 0; 862 for (i=0; i<n; i++) { 863 /* set the pointers */ 864 v = aa + ai[i]; 865 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 866 867 /* first, set the diagonal elements */ 868 AAUp[offset] = 1.0/v[nz]; 869 AALo[offset] = 1.0/v[nz]; 870 871 offset+=1; 872 if (nz>0) { 873 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 874 for (j=offset; j<offset+nz; j++) { 875 AAUp[j] = -AAUp[j]; 876 AALo[j] = AAUp[j]/v[nz]; 877 } 878 offset+=nz; 879 } 880 } 881 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 882 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 883 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 884 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 885 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 886 } 887 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 888 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 889 } catch(char *ex) { 890 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 891 } 892 } 893 PetscFunctionReturn(0); 894 } 895 896 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 897 { 898 PetscErrorCode ierr; 899 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 900 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 901 IS ip = a->row; 902 PetscBool perm_identity; 903 PetscInt n = A->rmap->n; 904 905 PetscFunctionBegin; 906 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 907 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 908 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 909 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 910 911 A->offloadmask = PETSC_OFFLOAD_BOTH; 912 913 /* lower triangular indices */ 914 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 915 if (!perm_identity) { 916 IS iip; 917 const PetscInt *irip,*rip; 918 919 ierr = 
ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 920 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 921 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 922 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 923 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 924 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 925 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 926 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 927 ierr = ISDestroy(&iip);CHKERRQ(ierr); 928 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 929 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 930 } 931 PetscFunctionReturn(0); 932 } 933 934 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 935 { 936 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 937 IS isrow = b->row,iscol = b->col; 938 PetscBool row_identity,col_identity; 939 PetscErrorCode ierr; 940 941 PetscFunctionBegin; 942 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 943 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 944 B->offloadmask = PETSC_OFFLOAD_CPU; 945 /* determine which version of MatSolve needs to be used. */ 946 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 947 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 948 if (row_identity && col_identity) { 949 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 950 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 951 B->ops->matsolve = NULL; 952 B->ops->matsolvetranspose = NULL; 953 } else { 954 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 955 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 956 B->ops->matsolve = NULL; 957 B->ops->matsolvetranspose = NULL; 958 } 959 960 /* get the triangular factors */ 961 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 962 PetscFunctionReturn(0); 963 } 964 965 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 966 { 967 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 968 IS ip = b->row; 969 PetscBool perm_identity; 970 PetscErrorCode ierr; 971 972 PetscFunctionBegin; 973 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 974 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 975 B->offloadmask = PETSC_OFFLOAD_CPU; 976 /* determine which version of MatSolve needs to be used. 
*/ 977 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 978 if (perm_identity) { 979 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 980 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 981 B->ops->matsolve = NULL; 982 B->ops->matsolvetranspose = NULL; 983 } else { 984 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 985 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 986 B->ops->matsolve = NULL; 987 B->ops->matsolvetranspose = NULL; 988 } 989 990 /* get the triangular factors */ 991 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 992 PetscFunctionReturn(0); 993 } 994 995 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 996 { 997 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 998 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 999 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1000 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1001 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1002 cusparseStatus_t stat; 1003 cusparseIndexBase_t indexBase; 1004 cusparseMatrixType_t matrixType; 1005 cusparseFillMode_t fillMode; 1006 cusparseDiagType_t diagType; 1007 cudaError_t cerr; 1008 PetscErrorCode ierr; 1009 1010 PetscFunctionBegin; 1011 /* allocate space for the transpose of the lower triangular factor */ 1012 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1013 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1014 1015 /* set the matrix descriptors of the lower triangular factor */ 1016 matrixType = cusparseGetMatType(loTriFactor->descr); 1017 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1018 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1019 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1020 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1021 1022 /* Create the matrix description */ 1023 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1024 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1025 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1026 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1027 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1028 1029 /* set the operation */ 1030 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1031 1032 /* allocate GPU space for the CSC of the lower triangular factor*/ 1033 loTriFactorT->csrMat = new CsrMatrix; 1034 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1035 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1036 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1037 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1038 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1039 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1040 1041 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1042 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1043 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1044 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1045 loTriFactor->csrMat->values->data().get(), 1046 loTriFactor->csrMat->row_offsets->data().get(), 1047 loTriFactor->csrMat->column_indices->data().get(), 1048 loTriFactorT->csrMat->values->data().get(), 1049 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1050 CUSPARSE_ACTION_NUMERIC,indexBase, 1051 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1052 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1053 #endif 1054 1055 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1056 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1057 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1058 loTriFactor->csrMat->values->data().get(), 1059 loTriFactor->csrMat->row_offsets->data().get(), 1060 loTriFactor->csrMat->column_indices->data().get(), 1061 loTriFactorT->csrMat->values->data().get(), 1062 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1063 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1064 CUSPARSE_ACTION_NUMERIC, indexBase, 1065 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1066 #else 1067 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1068 CUSPARSE_ACTION_NUMERIC, indexBase 1069 #endif 1070 );CHKERRCUSPARSE(stat); 1071 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1072 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1073 1074 /* Create the solve analysis information */ 1075 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1076 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1077 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1078 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1079 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1080 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1081 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1082 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1083 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1084 #endif 1085 1086 /* perform the solve analysis */ 1087 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1088 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1089 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1090 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 1091 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1092 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1093 #endif 1094 );CHKERRCUSPARSE(stat); 1095 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1096 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1097 1098 /* assign the pointer */ 1099 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1100 1101 /*********************************************/ 1102 /* Now the Transpose of the Upper Tri Factor */ 1103 
/*********************************************/ 1104 1105 /* allocate space for the transpose of the upper triangular factor */ 1106 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1107 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1108 1109 /* set the matrix descriptors of the upper triangular factor */ 1110 matrixType = cusparseGetMatType(upTriFactor->descr); 1111 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1112 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1113 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1114 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1115 1116 /* Create the matrix description */ 1117 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1118 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1119 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1120 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1121 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1122 1123 /* set the operation */ 1124 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1125 1126 /* allocate GPU space for the CSC of the upper triangular factor*/ 1127 upTriFactorT->csrMat = new CsrMatrix; 1128 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1129 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1130 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1131 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1132 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1133 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1134 1135 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1136 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1137 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1138 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1139 upTriFactor->csrMat->values->data().get(), 1140 upTriFactor->csrMat->row_offsets->data().get(), 1141 upTriFactor->csrMat->column_indices->data().get(), 1142 upTriFactorT->csrMat->values->data().get(), 1143 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1144 CUSPARSE_ACTION_NUMERIC,indexBase, 1145 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1146 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1147 #endif 1148 1149 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1150 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1151 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1152 upTriFactor->csrMat->values->data().get(), 1153 upTriFactor->csrMat->row_offsets->data().get(), 1154 upTriFactor->csrMat->column_indices->data().get(), 1155 upTriFactorT->csrMat->values->data().get(), 1156 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1157 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1158 CUSPARSE_ACTION_NUMERIC, indexBase, 1159 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1160 #else 1161 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1162 CUSPARSE_ACTION_NUMERIC, indexBase 1163 #endif 1164 );CHKERRCUSPARSE(stat); 1165 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1166 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1167 1168 /* Create the solve analysis information */ 1169 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1170 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1171 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1172 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1173 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1174 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1175 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1176 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1177 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1178 #endif 1179 1180 /* perform the solve analysis */ 1181 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1182 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1183 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1184 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1185 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1186 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1187 #endif 1188 );CHKERRCUSPARSE(stat); 1189 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1190 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1191 1192 /* assign the pointer */ 1193 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1194 PetscFunctionReturn(0); 1195 } 1196 1197 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1198 { 1199 
Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1200 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1201 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1202 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1203 cusparseStatus_t stat; 1204 cusparseIndexBase_t indexBase; 1205 cudaError_t err; 1206 PetscErrorCode ierr; 1207 1208 PetscFunctionBegin; 1209 if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1210 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1211 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1212 /* create cusparse matrix */ 1213 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1214 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1215 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1216 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1217 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1218 1219 /* set alpha and beta */ 1220 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1221 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1222 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1223 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1224 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1225 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1226 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1227 1228 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1229 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1230 CsrMatrix *matrixT= new CsrMatrix; 1231 matrixT->num_rows = A->cmap->n; 1232 matrixT->num_cols = A->rmap->n; 1233 matrixT->num_entries = a->nz; 1234 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1235 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1236 matrixT->values = new THRUSTARRAY(a->nz); 1237 1238 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 1239 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1240 1241 /* compute the transpose, i.e. 
the CSC */ 1242 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1243 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1244 A->cmap->n, matrix->num_entries, 1245 matrix->values->data().get(), 1246 cusparsestruct->rowoffsets_gpu->data().get(), 1247 matrix->column_indices->data().get(), 1248 matrixT->values->data().get(), 1249 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1250 CUSPARSE_ACTION_NUMERIC,indexBase, 1251 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1252 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1253 #endif 1254 1255 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1256 A->cmap->n, matrix->num_entries, 1257 matrix->values->data().get(), 1258 cusparsestruct->rowoffsets_gpu->data().get(), 1259 matrix->column_indices->data().get(), 1260 matrixT->values->data().get(), 1261 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1262 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1263 CUSPARSE_ACTION_NUMERIC,indexBase, 1264 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1265 #else 1266 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1267 CUSPARSE_ACTION_NUMERIC, indexBase 1268 #endif 1269 );CHKERRCUSPARSE(stat); 1270 matstructT->mat = matrixT; 1271 1272 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1273 stat = cusparseCreateCsr(&matstructT->matDescr, 1274 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1275 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1276 matrixT->values->data().get(), 1277 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1278 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1279 #endif 1280 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1281 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1282 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1283 #else 1284 CsrMatrix *temp = new CsrMatrix; 1285 CsrMatrix *tempT = new CsrMatrix; 1286 /* First convert HYB to CSR */ 1287 temp->num_rows = A->rmap->n; 1288 temp->num_cols = A->cmap->n; 1289 temp->num_entries = a->nz; 1290 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1291 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1292 temp->values = new THRUSTARRAY(a->nz); 1293 1294 stat = cusparse_hyb2csr(cusparsestruct->handle, 1295 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1296 temp->values->data().get(), 1297 temp->row_offsets->data().get(), 1298 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1299 1300 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1301 tempT->num_rows = A->rmap->n; 1302 tempT->num_cols = A->cmap->n; 1303 tempT->num_entries = a->nz; 1304 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1305 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1306 tempT->values = new THRUSTARRAY(a->nz); 1307 1308 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1309 temp->num_cols, temp->num_entries, 1310 temp->values->data().get(), 1311 temp->row_offsets->data().get(), 1312 temp->column_indices->data().get(), 1313 tempT->values->data().get(), 1314 tempT->column_indices->data().get(), 1315 tempT->row_offsets->data().get(), 1316 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1317 1318 /* Last, convert CSC to HYB */ 1319 cusparseHybMat_t hybMat; 1320 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1321 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1322 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1323 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1324 matstructT->descr, tempT->values->data().get(), 1325 tempT->row_offsets->data().get(), 1326 tempT->column_indices->data().get(), 1327 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1328 1329 /* assign the pointer */ 1330 matstructT->mat = hybMat; 1331 /* delete temporaries */ 1332 if (tempT) { 1333 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1334 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1335 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1336 delete (CsrMatrix*) tempT; 1337 } 1338 if (temp) { 1339 if (temp->values) delete (THRUSTARRAY*) temp->values; 1340 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1341 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1342 delete (CsrMatrix*) temp; 1343 } 1344 #endif 1345 } 1346 err = WaitForCUDA();CHKERRCUDA(err); 1347 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1348 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1349 /* the compressed row indices are not used for matTranspose */ 1350 matstructT->cprowIndices = NULL; 1351 /* assign the pointer */ 1352 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1353 PetscFunctionReturn(0); 1354 } 1355 1356 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1357 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1358 { 1359 PetscInt n = xx->map->n; 1360 const PetscScalar *barray; 1361 PetscScalar *xarray; 1362 thrust::device_ptr<const PetscScalar> bGPU; 1363 thrust::device_ptr<PetscScalar> xGPU; 1364 cusparseStatus_t stat; 1365 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1366 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1367 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1368 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1369 PetscErrorCode ierr; 1370 cudaError_t cerr; 1371 1372 PetscFunctionBegin; 1373 /* Analyze the matrix and create the transpose ...
on the fly */ 1374 if (!loTriFactorT && !upTriFactorT) { 1375 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1376 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1377 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1378 } 1379 1380 /* Get the GPU pointers */ 1381 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1382 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1383 xGPU = thrust::device_pointer_cast(xarray); 1384 bGPU = thrust::device_pointer_cast(barray); 1385 1386 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1387 /* First, reorder with the row permutation */ 1388 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1389 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1390 xGPU); 1391 1392 /* First, solve U */ 1393 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1394 upTriFactorT->csrMat->num_rows, 1395 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1396 upTriFactorT->csrMat->num_entries, 1397 #endif 1398 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1399 upTriFactorT->csrMat->values->data().get(), 1400 upTriFactorT->csrMat->row_offsets->data().get(), 1401 upTriFactorT->csrMat->column_indices->data().get(), 1402 upTriFactorT->solveInfo, 1403 xarray, tempGPU->data().get() 1404 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1405 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1406 #endif 1407 );CHKERRCUSPARSE(stat); 1408 1409 /* Then, solve L */ 1410 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1411 loTriFactorT->csrMat->num_rows, 1412 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1413 loTriFactorT->csrMat->num_entries, 1414 #endif 1415 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1416 loTriFactorT->csrMat->values->data().get(), 1417 loTriFactorT->csrMat->row_offsets->data().get(), 1418 loTriFactorT->csrMat->column_indices->data().get(), 1419 loTriFactorT->solveInfo, 1420 tempGPU->data().get(), xarray 1421 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1422 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1423 #endif 1424 );CHKERRCUSPARSE(stat); 1425 1426 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1427 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1428 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1429 tempGPU->begin()); 1430 1431 /* Copy the temporary to the full solution. 
*/ 1432 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1433 1434 /* restore */ 1435 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1436 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1437 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1438 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1439 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1440 PetscFunctionReturn(0); 1441 } 1442 1443 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1444 { 1445 const PetscScalar *barray; 1446 PetscScalar *xarray; 1447 cusparseStatus_t stat; 1448 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1449 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1450 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1451 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1452 PetscErrorCode ierr; 1453 cudaError_t cerr; 1454 1455 PetscFunctionBegin; 1456 /* Analyze the matrix and create the transpose ... on the fly */ 1457 if (!loTriFactorT && !upTriFactorT) { 1458 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1459 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1460 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1461 } 1462 1463 /* Get the GPU pointers */ 1464 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1465 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1466 1467 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1468 /* First, solve U */ 1469 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1470 upTriFactorT->csrMat->num_rows, 1471 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1472 upTriFactorT->csrMat->num_entries, 1473 #endif 1474 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1475 upTriFactorT->csrMat->values->data().get(), 1476 upTriFactorT->csrMat->row_offsets->data().get(), 1477 upTriFactorT->csrMat->column_indices->data().get(), 1478 upTriFactorT->solveInfo, 1479 barray, tempGPU->data().get() 1480 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1481 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1482 #endif 1483 );CHKERRCUSPARSE(stat); 1484 1485 /* Then, solve L */ 1486 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1487 loTriFactorT->csrMat->num_rows, 1488 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1489 loTriFactorT->csrMat->num_entries, 1490 #endif 1491 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1492 loTriFactorT->csrMat->values->data().get(), 1493 loTriFactorT->csrMat->row_offsets->data().get(), 1494 loTriFactorT->csrMat->column_indices->data().get(), 1495 loTriFactorT->solveInfo, 1496 tempGPU->data().get(), xarray 1497 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1498 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1499 #endif 1500 );CHKERRCUSPARSE(stat); 1501 1502 /* restore */ 1503 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1504 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1505 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1506 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1507 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1508 PetscFunctionReturn(0); 1509 } 1510 1511 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1512 { 1513 const PetscScalar *barray; 1514 PetscScalar 
*xarray; 1515 thrust::device_ptr<const PetscScalar> bGPU; 1516 thrust::device_ptr<PetscScalar> xGPU; 1517 cusparseStatus_t stat; 1518 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1519 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1520 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1521 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1522 PetscErrorCode ierr; 1523 cudaError_t cerr; 1524 1525 PetscFunctionBegin; 1526 1527 /* Get the GPU pointers */ 1528 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1529 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1530 xGPU = thrust::device_pointer_cast(xarray); 1531 bGPU = thrust::device_pointer_cast(barray); 1532 1533 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1534 /* First, reorder with the row permutation */ 1535 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1536 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1537 tempGPU->begin()); 1538 1539 /* Next, solve L */ 1540 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1541 loTriFactor->csrMat->num_rows, 1542 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1543 loTriFactor->csrMat->num_entries, 1544 #endif 1545 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1546 loTriFactor->csrMat->values->data().get(), 1547 loTriFactor->csrMat->row_offsets->data().get(), 1548 loTriFactor->csrMat->column_indices->data().get(), 1549 loTriFactor->solveInfo, 1550 tempGPU->data().get(), xarray 1551 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1552 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1553 #endif 1554 );CHKERRCUSPARSE(stat); 1555 1556 /* Then, solve U */ 1557 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1558 upTriFactor->csrMat->num_rows, 1559 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1560 upTriFactor->csrMat->num_entries, 1561 #endif 1562 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1563 upTriFactor->csrMat->values->data().get(), 1564 upTriFactor->csrMat->row_offsets->data().get(), 1565 upTriFactor->csrMat->column_indices->data().get(), 1566 upTriFactor->solveInfo, 1567 xarray, tempGPU->data().get() 1568 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1569 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1570 #endif 1571 );CHKERRCUSPARSE(stat); 1572 1573 /* Last, reorder with the column permutation */ 1574 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1575 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1576 xGPU); 1577 1578 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1579 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1580 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1581 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1582 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1583 PetscFunctionReturn(0); 1584 } 1585 1586 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1587 { 1588 const PetscScalar *barray; 1589 PetscScalar *xarray; 1590 cusparseStatus_t stat; 1591 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1592 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1593 
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1594 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1595 PetscErrorCode ierr; 1596 cudaError_t cerr; 1597 1598 PetscFunctionBegin; 1599 /* Get the GPU pointers */ 1600 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1601 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1602 1603 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1604 /* First, solve L */ 1605 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1606 loTriFactor->csrMat->num_rows, 1607 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1608 loTriFactor->csrMat->num_entries, 1609 #endif 1610 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1611 loTriFactor->csrMat->values->data().get(), 1612 loTriFactor->csrMat->row_offsets->data().get(), 1613 loTriFactor->csrMat->column_indices->data().get(), 1614 loTriFactor->solveInfo, 1615 barray, tempGPU->data().get() 1616 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1617 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1618 #endif 1619 );CHKERRCUSPARSE(stat); 1620 1621 /* Next, solve U */ 1622 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1623 upTriFactor->csrMat->num_rows, 1624 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1625 upTriFactor->csrMat->num_entries, 1626 #endif 1627 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1628 upTriFactor->csrMat->values->data().get(), 1629 upTriFactor->csrMat->row_offsets->data().get(), 1630 upTriFactor->csrMat->column_indices->data().get(), 1631 upTriFactor->solveInfo, 1632 tempGPU->data().get(), xarray 1633 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1634 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1635 #endif 1636 );CHKERRCUSPARSE(stat); 1637 1638 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1639 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1640 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1641 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1642 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1643 PetscFunctionReturn(0); 1644 } 1645 1646 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1647 { 1648 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1649 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1650 cudaError_t cerr; 1651 PetscErrorCode ierr; 1652 1653 PetscFunctionBegin; 1654 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1655 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1656 1657 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1658 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1659 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1660 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1661 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1662 A->offloadmask = PETSC_OFFLOAD_BOTH; 1663 } 1664 PetscFunctionReturn(0); 1665 } 1666 1667 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1668 { 1669 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1670 PetscErrorCode ierr; 1671 1672 PetscFunctionBegin; 1673 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1674 *array = a->a; 1675 A->offloadmask = PETSC_OFFLOAD_CPU; 1676 PetscFunctionReturn(0); 1677 } 1678 1679 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1680 { 1681 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1682 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1683 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 1684 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1685 PetscErrorCode ierr; 1686 cusparseStatus_t stat; 1687 cudaError_t err; 1688 1689 PetscFunctionBegin; 1690 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU"); 1691 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1692 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1693 /* Copy values only */ 1694 CsrMatrix *matrix,*matrixT; 1695 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1696 1697 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1698 matrix->values->assign(a->a, a->a+a->nz); 1699 err = WaitForCUDA();CHKERRCUDA(err); 1700 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1701 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1702 1703 /* Update matT when it was built before */ 1704 if (cusparsestruct->matTranspose) { 1705 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1706 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1707 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1708 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1709 A->cmap->n, matrix->num_entries, 1710 matrix->values->data().get(), 1711 cusparsestruct->rowoffsets_gpu->data().get(), 1712 matrix->column_indices->data().get(), 1713 matrixT->values->data().get(), 1714 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1715 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1716 CUSPARSE_ACTION_NUMERIC,indexBase, 1717 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1718 #else 1719 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1720 CUSPARSE_ACTION_NUMERIC, indexBase 1721 #endif 1722 );CHKERRCUSPARSE(stat); 1723 err = WaitForCUDA();CHKERRCUDA(err); 1724 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1725 } 1726 } else { 1727 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1728 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1729 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1730 delete cusparsestruct->workVector; 1731 delete cusparsestruct->rowoffsets_gpu; 1732 try { 1733 if (a->compressedrow.use) { 1734 m = a->compressedrow.nrows; 1735 ii = a->compressedrow.i; 1736 ridx = a->compressedrow.rindex; 1737 } else { 1738 m = A->rmap->n; 1739 ii = a->i; 1740 ridx = NULL; 1741 } 1742 cusparsestruct->nrows = m; 1743 1744 /* create cusparse matrix */ 1745 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1746 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1747 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1748 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1749 1750 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1751 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1752 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1753 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1754 err = 
cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1755 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1756 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1757 1758 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1759 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1760 /* set the matrix */ 1761 CsrMatrix *mat= new CsrMatrix; 1762 mat->num_rows = m; 1763 mat->num_cols = A->cmap->n; 1764 mat->num_entries = a->nz; 1765 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1766 mat->row_offsets->assign(ii, ii + m+1); 1767 1768 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1769 mat->column_indices->assign(a->j, a->j+a->nz); 1770 1771 mat->values = new THRUSTARRAY(a->nz); 1772 mat->values->assign(a->a, a->a+a->nz); 1773 1774 /* assign the pointer */ 1775 matstruct->mat = mat; 1776 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1777 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1778 stat = cusparseCreateCsr(&matstruct->matDescr, 1779 mat->num_rows, mat->num_cols, mat->num_entries, 1780 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1781 mat->values->data().get(), 1782 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1783 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1784 } 1785 #endif 1786 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1787 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1788 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1789 #else 1790 CsrMatrix *mat= new CsrMatrix; 1791 mat->num_rows = m; 1792 mat->num_cols = A->cmap->n; 1793 mat->num_entries = a->nz; 1794 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1795 mat->row_offsets->assign(ii, ii + m+1); 1796 1797 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1798 mat->column_indices->assign(a->j, a->j+a->nz); 1799 1800 mat->values = new THRUSTARRAY(a->nz); 1801 mat->values->assign(a->a, a->a+a->nz); 1802 1803 cusparseHybMat_t hybMat; 1804 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1805 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1806 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1807 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1808 matstruct->descr, mat->values->data().get(), 1809 mat->row_offsets->data().get(), 1810 mat->column_indices->data().get(), 1811 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1812 /* assign the pointer */ 1813 matstruct->mat = hybMat; 1814 1815 if (mat) { 1816 if (mat->values) delete (THRUSTARRAY*)mat->values; 1817 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1818 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1819 delete (CsrMatrix*)mat; 1820 } 1821 #endif 1822 } 1823 1824 /* assign the compressed row indices */ 1825 if (a->compressedrow.use) { 1826 cusparsestruct->workVector = new THRUSTARRAY(m); 1827 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1828 matstruct->cprowIndices->assign(ridx,ridx+m); 1829 tmp = m; 1830 } else { 1831 cusparsestruct->workVector = NULL; 1832 matstruct->cprowIndices = NULL; 1833 tmp = 0; 1834 } 1835 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1836 1837 /* assign the pointer */ 1838 cusparsestruct->mat = matstruct; 1839 } catch(char *ex) { 1840 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1841 } 1842 err = WaitForCUDA();CHKERRCUDA(err); 1843 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1844 cusparsestruct->nonzerostate = A->nonzerostate; 1845 } 1846 A->offloadmask = PETSC_OFFLOAD_BOTH; 1847 } 1848 PetscFunctionReturn(0); 1849 } 1850 1851 struct VecCUDAPlusEquals 1852 { 1853 template <typename Tuple> 1854 __host__ __device__ 1855 void operator()(Tuple t) 1856 { 1857 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1858 } 1859 }; 1860 1861 struct VecCUDAEquals 1862 { 1863 template <typename Tuple> 1864 __host__ __device__ 1865 void operator()(Tuple t) 1866 { 1867 thrust::get<1>(t) = thrust::get<0>(t); 1868 } 1869 }; 1870 1871 struct VecCUDAEqualsReverse 1872 { 1873 template <typename Tuple> 1874 __host__ __device__ 1875 void operator()(Tuple t) 1876 { 1877 thrust::get<0>(t) = thrust::get<1>(t); 1878 } 1879 }; 1880 1881 struct MatMatCusparse { 1882 PetscBool cisdense; 1883 PetscScalar *Bt; 1884 Mat X; 1885 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1886 PetscLogDouble flops; 1887 CsrMatrix *Bcsr; 1888 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1889 cusparseSpMatDescr_t matSpBDescr; 1890 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1891 cusparseDnMatDescr_t matBDescr; 1892 cusparseDnMatDescr_t matCDescr; 1893 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1894 size_t mmBufferSize; 1895 void *mmBuffer; 1896 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1897 cusparseSpGEMMDescr_t spgemmDesc; 1898 #endif 1899 }; 1900 1901 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1902 { 1903 PetscErrorCode ierr; 1904 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1905 cudaError_t cerr; 1906 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1907 cusparseStatus_t stat; 1908 #endif 1909 1910 PetscFunctionBegin; 1911 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1912 delete mmdata->Bcsr; 1913 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1914 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1915 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1916 if 
(mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1917 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1918 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1919 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1920 #endif 1921 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1922 ierr = PetscFree(data);CHKERRQ(ierr); 1923 PetscFunctionReturn(0); 1924 } 1925 1926 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1927 1928 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1929 { 1930 Mat_Product *product = C->product; 1931 Mat A,B; 1932 PetscInt m,n,blda,clda; 1933 PetscBool flg,biscuda; 1934 Mat_SeqAIJCUSPARSE *cusp; 1935 cusparseStatus_t stat; 1936 cusparseOperation_t opA; 1937 const PetscScalar *barray; 1938 PetscScalar *carray; 1939 PetscErrorCode ierr; 1940 MatMatCusparse *mmdata; 1941 Mat_SeqAIJCUSPARSEMultStruct *mat; 1942 CsrMatrix *csrmat; 1943 cudaError_t cerr; 1944 1945 PetscFunctionBegin; 1946 MatCheckProduct(C,1); 1947 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1948 mmdata = (MatMatCusparse*)product->data; 1949 A = product->A; 1950 B = product->B; 1951 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1952 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1953 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1954 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1955 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1956 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1957 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1958 switch (product->type) { 1959 case MATPRODUCT_AB: 1960 case MATPRODUCT_PtAP: 1961 mat = cusp->mat; 1962 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1963 m = A->rmap->n; 1964 n = B->cmap->n; 1965 break; 1966 case MATPRODUCT_AtB: 1967 if (!cusp->transgen) { 1968 mat = cusp->mat; 1969 opA = CUSPARSE_OPERATION_TRANSPOSE; 1970 } else { 1971 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1972 mat = cusp->matTranspose; 1973 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1974 } 1975 m = A->cmap->n; 1976 n = B->cmap->n; 1977 break; 1978 case MATPRODUCT_ABt: 1979 case MATPRODUCT_RARt: 1980 mat = cusp->mat; 1981 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1982 m = A->rmap->n; 1983 n = B->rmap->n; 1984 break; 1985 default: 1986 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1987 } 1988 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1989 csrmat = (CsrMatrix*)mat->mat; 1990 /* if the user passed a CPU matrix, copy the data to the GPU */ 1991 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1992 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1993 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1994 1995 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1996 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1997 ierr = 
MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1998 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1999 } else { 2000 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2001 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2002 } 2003 2004 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2005 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2006 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2007 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2008 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2009 size_t mmBufferSize; 2010 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2011 if (!mmdata->matBDescr) { 2012 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2013 mmdata->Blda = blda; 2014 } 2015 2016 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2017 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2018 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2019 mmdata->Clda = clda; 2020 } 2021 2022 if (!mat->matDescr) { 2023 stat = cusparseCreateCsr(&mat->matDescr, 2024 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2025 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2026 csrmat->values->data().get(), 2027 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2028 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2029 } 2030 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2031 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2032 mmdata->matCDescr,cusparse_scalartype, 2033 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2034 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2035 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2036 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2037 mmdata->mmBufferSize = mmBufferSize; 2038 } 2039 mmdata->initialized = PETSC_TRUE; 2040 } else { 2041 /* to be safe, always update pointers of the mats */ 2042 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2043 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2044 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2045 } 2046 2047 /* do cusparseSpMM, which supports transpose on B */ 2048 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2049 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2050 mmdata->matCDescr,cusparse_scalartype, 2051 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2052 #else 2053 PetscInt k; 2054 /* cusparseXcsrmm does not support transpose on B */ 2055 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2056 cublasHandle_t cublasv2handle; 2057 cublasStatus_t cerr; 2058 2059 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2060 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2061 B->cmap->n,B->rmap->n, 2062 &PETSC_CUSPARSE_ONE ,barray,blda, 2063 &PETSC_CUSPARSE_ZERO,barray,blda, 2064
mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2065 blda = B->cmap->n; 2066 k = B->cmap->n; 2067 } else { 2068 k = B->rmap->n; 2069 } 2070 2071 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2072 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2073 csrmat->num_entries,mat->alpha_one,mat->descr, 2074 csrmat->values->data().get(), 2075 csrmat->row_offsets->data().get(), 2076 csrmat->column_indices->data().get(), 2077 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2078 carray,clda);CHKERRCUSPARSE(stat); 2079 #endif 2080 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2081 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2082 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2083 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2084 if (product->type == MATPRODUCT_RARt) { 2085 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2086 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2087 } else if (product->type == MATPRODUCT_PtAP) { 2088 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2089 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2090 } else { 2091 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2092 } 2093 if (mmdata->cisdense) { 2094 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2095 } 2096 if (!biscuda) { 2097 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2098 } 2099 PetscFunctionReturn(0); 2100 } 2101 2102 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2103 { 2104 Mat_Product *product = C->product; 2105 Mat A,B; 2106 PetscInt m,n; 2107 PetscBool cisdense,flg; 2108 PetscErrorCode ierr; 2109 MatMatCusparse *mmdata; 2110 Mat_SeqAIJCUSPARSE *cusp; 2111 2112 PetscFunctionBegin; 2113 MatCheckProduct(C,1); 2114 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2115 A = product->A; 2116 B = product->B; 2117 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2118 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2119 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2120 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2121 switch (product->type) { 2122 case MATPRODUCT_AB: 2123 m = A->rmap->n; 2124 n = B->cmap->n; 2125 break; 2126 case MATPRODUCT_AtB: 2127 m = A->cmap->n; 2128 n = B->cmap->n; 2129 break; 2130 case MATPRODUCT_ABt: 2131 m = A->rmap->n; 2132 n = B->rmap->n; 2133 break; 2134 case MATPRODUCT_PtAP: 2135 m = B->cmap->n; 2136 n = B->cmap->n; 2137 break; 2138 case MATPRODUCT_RARt: 2139 m = B->rmap->n; 2140 n = B->rmap->n; 2141 break; 2142 default: 2143 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2144 } 2145 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2146 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2147 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2148 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2149 2150 /* product data */ 2151 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2152 mmdata->cisdense = cisdense; 2153 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2154 /* cusparseXcsrmm does not support transpose on B, so 
we allocate buffer to store B^T */ 2155 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2156 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2157 } 2158 #endif 2159 /* for these products we need intermediate storage */ 2160 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2161 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2162 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2163 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2164 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2165 } else { 2166 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2167 } 2168 } 2169 C->product->data = mmdata; 2170 C->product->destroy = MatDestroy_MatMatCusparse; 2171 2172 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2173 PetscFunctionReturn(0); 2174 } 2175 2176 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2177 { 2178 Mat_Product *product = C->product; 2179 Mat A,B; 2180 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2181 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2182 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2183 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2184 PetscBool flg; 2185 PetscErrorCode ierr; 2186 cusparseStatus_t stat; 2187 cudaError_t cerr; 2188 MatProductType ptype; 2189 MatMatCusparse *mmdata; 2190 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2191 cusparseSpMatDescr_t BmatSpDescr; 2192 #endif 2193 2194 PetscFunctionBegin; 2195 MatCheckProduct(C,1); 2196 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2197 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2198 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2199 mmdata = (MatMatCusparse*)C->product->data; 2200 A = product->A; 2201 B = product->B; 2202 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2203 mmdata->reusesym = PETSC_FALSE; 2204 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2205 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2206 Cmat = Ccusp->mat; 2207 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2208 Ccsr = (CsrMatrix*)Cmat->mat; 2209 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2210 goto finalize; 2211 } 2212 if (!c->nz) goto finalize; 2213 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2214 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2215 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2216 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2217 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2218 if (B->boundtocpu) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2219 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2220 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2221 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2222 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2223 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2224 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2225 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2226 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2227 2228 ptype = product->type; 2229 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2230 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2231 switch (ptype) { 2232 case MATPRODUCT_AB: 2233 Amat = Acusp->mat; 2234 Bmat = Bcusp->mat; 2235 break; 2236 case MATPRODUCT_AtB: 2237 Amat = Acusp->matTranspose; 2238 Bmat = Bcusp->mat; 2239 break; 2240 case MATPRODUCT_ABt: 2241 Amat = Acusp->mat; 2242 Bmat = Bcusp->matTranspose; 2243 break; 2244 default: 2245 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2246 } 2247 Cmat = Ccusp->mat; 2248 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2249 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2250 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2251 Acsr = (CsrMatrix*)Amat->mat; 2252 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2253 Ccsr = (CsrMatrix*)Cmat->mat; 2254 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2255 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2256 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2257 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2258 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2259 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2260 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2261 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2262 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2263 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2264 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2265 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2266 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2267 #else 2268 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2269 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2270 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2271 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2272 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2273 #endif 2274 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2275 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2276 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2277 C->offloadmask = PETSC_OFFLOAD_GPU; 2278 finalize: 2279 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2280 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2281 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2282 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2283 c->reallocs = 0; 2284 C->info.mallocs += 0; 2285 C->info.nz_unneeded = 0; 2286 C->assembled = C->was_assembled = PETSC_TRUE; 2287 C->num_ass++; 2288 /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! 
*/ 2289 ierr = MatSeqAIJCUSPARSECopyFromGPU(C);CHKERRQ(ierr); 2290 PetscFunctionReturn(0); 2291 } 2292 2293 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2294 { 2295 Mat_Product *product = C->product; 2296 Mat A,B; 2297 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2298 Mat_SeqAIJ *a,*b,*c; 2299 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2300 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2301 PetscInt i,j,m,n,k; 2302 PetscBool flg; 2303 PetscErrorCode ierr; 2304 cusparseStatus_t stat; 2305 cudaError_t cerr; 2306 MatProductType ptype; 2307 MatMatCusparse *mmdata; 2308 PetscLogDouble flops; 2309 PetscBool biscompressed,ciscompressed; 2310 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2311 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2312 size_t bufSize2; 2313 cusparseSpMatDescr_t BmatSpDescr; 2314 #else 2315 int cnz; 2316 #endif 2317 2318 PetscFunctionBegin; 2319 MatCheckProduct(C,1); 2320 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2321 A = product->A; 2322 B = product->B; 2323 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2324 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2325 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2326 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2327 a = (Mat_SeqAIJ*)A->data; 2328 b = (Mat_SeqAIJ*)B->data; 2329 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2330 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2331 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2332 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2333 2334 /* product data */ 2335 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2336 C->product->data = mmdata; 2337 C->product->destroy = MatDestroy_MatMatCusparse; 2338 2339 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2340 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2341 ptype = product->type; 2342 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2343 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2344 biscompressed = PETSC_FALSE; 2345 ciscompressed = PETSC_FALSE; 2346 switch (ptype) { 2347 case MATPRODUCT_AB: 2348 m = A->rmap->n; 2349 n = B->cmap->n; 2350 k = A->cmap->n; 2351 Amat = Acusp->mat; 2352 Bmat = Bcusp->mat; 2353 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2354 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2355 break; 2356 case MATPRODUCT_AtB: 2357 m = A->cmap->n; 2358 n = B->cmap->n; 2359 k = A->rmap->n; 2360 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 2361 Amat = Acusp->matTranspose; 2362 Bmat = Bcusp->mat; 2363 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2364 break; 2365 case MATPRODUCT_ABt: 2366 m = A->rmap->n; 2367 n = B->rmap->n; 2368 k = A->cmap->n; 2369 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 2370 Amat = Acusp->mat; 2371 Bmat = Bcusp->matTranspose; 2372 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2373 break; 2374 default: 2375 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2376 } 2377 2378 /* create cusparse matrix */ 2379 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2380 ierr = 
MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2381 c = (Mat_SeqAIJ*)C->data; 2382 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2383 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2384 Ccsr = new CsrMatrix; 2385 2386 c->compressedrow.use = ciscompressed; 2387 if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */ 2388 c->compressedrow.nrows = a->compressedrow.nrows; 2389 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2390 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2391 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2392 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2393 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2394 } else { 2395 c->compressedrow.nrows = 0; 2396 c->compressedrow.i = NULL; 2397 c->compressedrow.rindex = NULL; 2398 Ccusp->workVector = NULL; 2399 Cmat->cprowIndices = NULL; 2400 } 2401 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2402 Ccusp->mat = Cmat; 2403 Ccusp->mat->mat = Ccsr; 2404 Ccsr->num_rows = Ccusp->nrows; 2405 Ccsr->num_cols = n; 2406 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2407 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2408 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2409 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2410 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2411 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2412 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2413 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2414 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2415 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2416 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns!
*/ 2417 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2418 c->nz = 0; 2419 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2420 Ccsr->values = new THRUSTARRAY(c->nz); 2421 goto finalizesym; 2422 } 2423 2424 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2425 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2426 Acsr = (CsrMatrix*)Amat->mat; 2427 if (!biscompressed) { 2428 Bcsr = (CsrMatrix*)Bmat->mat; 2429 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2430 BmatSpDescr = Bmat->matDescr; 2431 #endif 2432 } else { /* we need to use row offsets for the full matrix */ 2433 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2434 Bcsr = new CsrMatrix; 2435 Bcsr->num_rows = B->rmap->n; 2436 Bcsr->num_cols = cBcsr->num_cols; 2437 Bcsr->num_entries = cBcsr->num_entries; 2438 Bcsr->column_indices = cBcsr->column_indices; 2439 Bcsr->values = cBcsr->values; 2440 if (!Bcusp->rowoffsets_gpu) { 2441 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2442 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2443 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2444 } 2445 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2446 mmdata->Bcsr = Bcsr; 2447 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2448 if (Bcsr->num_rows && Bcsr->num_cols) { 2449 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2450 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2451 Bcsr->values->data().get(), 2452 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2453 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2454 } 2455 BmatSpDescr = mmdata->matSpBDescr; 2456 #endif 2457 } 2458 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2459 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2460 /* precompute flops count */ 2461 if (ptype == MATPRODUCT_AB) { 2462 for (i=0, flops = 0; i<A->rmap->n; i++) { 2463 const PetscInt st = a->i[i]; 2464 const PetscInt en = a->i[i+1]; 2465 for (j=st; j<en; j++) { 2466 const PetscInt brow = a->j[j]; 2467 flops += 2.*(b->i[brow+1] - b->i[brow]); 2468 } 2469 } 2470 } else if (ptype == MATPRODUCT_AtB) { 2471 for (i=0, flops = 0; i<A->rmap->n; i++) { 2472 const PetscInt anzi = a->i[i+1] - a->i[i]; 2473 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2474 flops += (2.*anzi)*bnzi; 2475 } 2476 } else { /* TODO */ 2477 flops = 0.; 2478 } 2479 2480 mmdata->flops = flops; 2481 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2482 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2483 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2484 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2485 NULL, NULL, NULL, 2486 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2487 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2488 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2489 /* ask bufferSize bytes for external memory */ 2490 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2491 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2492 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2493 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 
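/* The CUSPARSE generic SpGEMM API follows a query-then-run pattern: each step is first called with a NULL buffer to obtain the required size (bufSize2 above, mmdata->mmBufferSize below), the buffer is then allocated with cudaMalloc, and the same call is repeated with that buffer to do the actual work. */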
2494 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2495 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2496 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2497 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2498 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2499 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2500 /* ask again for the buffer size in bytes needed for external memory */ 2501 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2502 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2503 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2504 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2505 /* Neither the CUSPARSE documentation nor the API is clear here: 2506 we need both buffers to perform the operations properly! 2507 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API; 2508 it only appears in the workEstimation calls, but it seems it is needed in compute, so probably the address 2509 is stored in the descriptor! What a messy API... */ 2510 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2511 /* compute the intermediate product of A * B */ 2512 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2513 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2514 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2515 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2516 /* get matrix C non-zero entries C_nnz1 */ 2517 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2518 c->nz = (PetscInt) C_nnz1; 2519 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2520 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2521 Ccsr->values = new THRUSTARRAY(c->nz); 2522 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2523 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2524 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2525 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2526 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2527 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2528 #else 2529 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2530 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2531 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2532 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2533 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2534 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2535 c->nz = cnz; 2536 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2537 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2538 Ccsr->values = new THRUSTARRAY(c->nz); 2539 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2540 2541 stat = cusparseSetPointerMode(Ccusp->handle,
CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2542 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2543 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2544 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2545 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2546 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2547 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2548 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2549 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2550 #endif 2551 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2552 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2553 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2554 finalizesym: 2555 c->singlemalloc = PETSC_FALSE; 2556 c->free_a = PETSC_TRUE; 2557 c->free_ij = PETSC_TRUE; 2558 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2559 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2560 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2561 PetscInt *d_i = c->i; 2562 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2563 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2564 ii = *Ccsr->row_offsets; 2565 jj = *Ccsr->column_indices; 2566 if (ciscompressed) d_i = c->compressedrow.i; 2567 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2568 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2569 } else { 2570 PetscInt *d_i = c->i; 2571 if (ciscompressed) d_i = c->compressedrow.i; 2572 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2573 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2574 } 2575 if (ciscompressed) { /* need to expand host row offsets */ 2576 PetscInt r = 0; 2577 c->i[0] = 0; 2578 for (k = 0; k < c->compressedrow.nrows; k++) { 2579 const PetscInt next = c->compressedrow.rindex[k]; 2580 const PetscInt old = c->compressedrow.i[k]; 2581 for (; r < next; r++) c->i[r+1] = old; 2582 } 2583 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2584 } 2585 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2586 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2587 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2588 c->maxnz = c->nz; 2589 c->nonzerorowcnt = 0; 2590 c->rmax = 0; 2591 for (k = 0; k < m; k++) { 2592 const PetscInt nn = c->i[k+1] - c->i[k]; 2593 c->ilen[k] = c->imax[k] = nn; 2594 c->nonzerorowcnt += (PetscInt)!!nn; 2595 c->rmax = PetscMax(c->rmax,nn); 2596 } 2597 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2598 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2599 Ccsr->num_entries = c->nz; 2600 2601 C->nonzerostate++; 2602 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2603 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2604 
Ccusp->nonzerostate = C->nonzerostate; 2605 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2606 C->preallocated = PETSC_TRUE; 2607 C->assembled = PETSC_FALSE; 2608 C->was_assembled = PETSC_FALSE; 2609 if (product->api_user) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2610 mmdata->reusesym = PETSC_TRUE; 2611 C->offloadmask = PETSC_OFFLOAD_GPU; 2612 } 2613 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2614 PetscFunctionReturn(0); 2615 } 2616 2617 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2618 2619 /* handles sparse or dense B */ 2620 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2621 { 2622 Mat_Product *product = mat->product; 2623 PetscErrorCode ierr; 2624 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2625 2626 PetscFunctionBegin; 2627 MatCheckProduct(mat,1); 2628 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2629 if (!product->B->boundtocpu) { 2630 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2631 } 2632 if (product->type == MATPRODUCT_ABC) { 2633 Ciscusp = PETSC_FALSE; 2634 if (!product->C->boundtocpu) { 2635 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2636 } 2637 } 2638 if (isdense) { 2639 switch (product->type) { 2640 case MATPRODUCT_AB: 2641 case MATPRODUCT_AtB: 2642 case MATPRODUCT_ABt: 2643 case MATPRODUCT_PtAP: 2644 case MATPRODUCT_RARt: 2645 if (product->A->boundtocpu) { 2646 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2647 } else { 2648 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2649 } 2650 break; 2651 case MATPRODUCT_ABC: 2652 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2653 break; 2654 default: 2655 break; 2656 } 2657 } else if (Biscusp && Ciscusp) { 2658 switch (product->type) { 2659 case MATPRODUCT_AB: 2660 case MATPRODUCT_AtB: 2661 case MATPRODUCT_ABt: 2662 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2663 break; 2664 case MATPRODUCT_PtAP: 2665 case MATPRODUCT_RARt: 2666 case MATPRODUCT_ABC: 2667 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2668 break; 2669 default: 2670 break; 2671 } 2672 } else { /* fallback for AIJ */ 2673 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2674 } 2675 PetscFunctionReturn(0); 2676 } 2677 2678 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2679 { 2680 PetscErrorCode ierr; 2681 2682 PetscFunctionBegin; 2683 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2684 PetscFunctionReturn(0); 2685 } 2686 2687 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2688 { 2689 PetscErrorCode ierr; 2690 2691 PetscFunctionBegin; 2692 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2693 PetscFunctionReturn(0); 2694 } 2695 2696 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2697 { 2698 PetscErrorCode ierr; 2699 2700 PetscFunctionBegin; 2701 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2702 PetscFunctionReturn(0); 2703 } 2704 2705 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2706 { 2707 PetscErrorCode ierr; 2708 2709 PetscFunctionBegin; 2710 ierr = 
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2711 PetscFunctionReturn(0); 2712 } 2713 2714 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2715 { 2716 PetscErrorCode ierr; 2717 2718 PetscFunctionBegin; 2719 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2720 PetscFunctionReturn(0); 2721 } 2722 2723 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2724 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2725 { 2726 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2727 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2728 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2729 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2730 PetscErrorCode ierr; 2731 cudaError_t cerr; 2732 cusparseStatus_t stat; 2733 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2734 PetscBool compressed; 2735 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2736 PetscInt nx,ny; 2737 #endif 2738 2739 PetscFunctionBegin; 2740 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2741 if (!a->nonzerorowcnt) { 2742 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2743 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2744 PetscFunctionReturn(0); 2745 } 2746 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2747 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2748 if (!trans) { 2749 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2750 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2751 } else { 2752 if (herm || !cusparsestruct->transgen) { 2753 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2754 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2755 } else { 2756 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2757 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2758 } 2759 } 2760 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2761 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2762 2763 try { 2764 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2765 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2766 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2767 2768 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2769 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2770 /* z = A x + beta y. 2771 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2772 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2773 */ 2774 xptr = xarray; 2775 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2776 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2777 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2778 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2779 allocated to accommodate different uses. So we get the length info directly from mat. 
2780 */ 2781 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2782 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2783 nx = mat->num_cols; 2784 ny = mat->num_rows; 2785 } 2786 #endif 2787 } else { 2788 /* z = A^T x + beta y 2789 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2790 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2791 */ 2792 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2793 dptr = zarray; 2794 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2795 if (compressed) { /* Scatter x to work vector */ 2796 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2797 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2798 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2799 VecCUDAEqualsReverse()); 2800 } 2801 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2802 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2803 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2804 nx = mat->num_rows; 2805 ny = mat->num_cols; 2806 } 2807 #endif 2808 } 2809 2810 /* csr_spmv does y = alpha op(A) x + beta y */ 2811 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2812 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2813 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2814 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2815 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2816 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2817 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2818 matstruct->matDescr, 2819 matstruct->cuSpMV[opA].vecXDescr, beta, 2820 matstruct->cuSpMV[opA].vecYDescr, 2821 cusparse_scalartype, 2822 cusparsestruct->spmvAlg, 2823 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2824 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2825 2826 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2827 } else { 2828 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2829 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2830 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2831 } 2832 2833 stat = cusparseSpMV(cusparsestruct->handle, opA, 2834 matstruct->alpha_one, 2835 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2836 matstruct->cuSpMV[opA].vecXDescr, 2837 beta, 2838 matstruct->cuSpMV[opA].vecYDescr, 2839 cusparse_scalartype, 2840 cusparsestruct->spmvAlg, 2841 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2842 #else 2843 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2844 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2845 mat->num_rows, mat->num_cols, 2846 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2847 mat->values->data().get(), mat->row_offsets->data().get(), 2848 mat->column_indices->data().get(), xptr, beta, 2849 dptr);CHKERRCUSPARSE(stat); 2850 
#endif 2851 } else { 2852 if (cusparsestruct->nrows) { 2853 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2854 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2855 #else 2856 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2857 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2858 matstruct->alpha_one, matstruct->descr, hybMat, 2859 xptr, beta, 2860 dptr);CHKERRCUSPARSE(stat); 2861 #endif 2862 } 2863 } 2864 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2865 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2866 2867 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2868 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2869 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2870 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2871 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2872 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2873 } 2874 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2875 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2876 } 2877 2878 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2879 if (compressed) { 2880 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2881 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2882 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2883 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2884 VecCUDAPlusEquals()); 2885 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2886 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2887 } 2888 } else { 2889 if (yy && yy != zz) { 2890 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2891 } 2892 } 2893 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2894 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2895 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2896 } catch(char *ex) { 2897 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2898 } 2899 if (yy) { 2900 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2901 } else { 2902 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2903 } 2904 PetscFunctionReturn(0); 2905 } 2906 2907 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2908 { 2909 PetscErrorCode ierr; 2910 2911 PetscFunctionBegin; 2912 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2913 PetscFunctionReturn(0); 2914 } 2915 2916 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2917 { 2918 PetscErrorCode ierr; 2919 PetscSplitCSRDataStructure *d_mat = NULL; 2920 PetscFunctionBegin; 2921 if (A->factortype == MAT_FACTOR_NONE) { 2922 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2923 } 2924 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 
2925 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2926 if (d_mat) { 2927 A->offloadmask = PETSC_OFFLOAD_GPU; 2928 } 2929 2930 PetscFunctionReturn(0); 2931 } 2932 2933 /* --------------------------------------------------------------------------------*/ 2934 /*@ 2935 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2936 (the default parallel PETSc format). This matrix will ultimately be pushed down 2937 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2938 assembly performance the user should preallocate the matrix storage by setting 2939 the parameter nz (or the array nnz). By setting these parameters accurately, 2940 performance during matrix assembly can be increased by more than a factor of 50. 2941 2942 Collective 2943 2944 Input Parameters: 2945 + comm - MPI communicator, set to PETSC_COMM_SELF 2946 . m - number of rows 2947 . n - number of columns 2948 . nz - number of nonzeros per row (same for all rows) 2949 - nnz - array containing the number of nonzeros in the various rows 2950 (possibly different for each row) or NULL 2951 2952 Output Parameter: 2953 . A - the matrix 2954 2955 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2956 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2957 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2958 2959 Notes: 2960 If nnz is given then nz is ignored. 2961 2962 The AIJ format (also called the Yale sparse matrix format or 2963 compressed row storage) is fully compatible with standard Fortran 77 2964 storage. That is, the stored row and column indices can begin at 2965 either one (as in Fortran) or zero. See the users' manual for details. 2966 2967 Specify the preallocated storage with either nz or nnz (not both). 2968 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 2969 allocation. For large problems you MUST preallocate memory or you 2970 will get TERRIBLE performance; see the users' manual chapter on matrices. 2971 2972 By default, this format uses inodes (identical nodes) when possible, to 2973 improve numerical efficiency of matrix-vector products and solves. We 2974 search for consecutive rows with the same nonzero structure, thereby 2975 reusing matrix information to achieve increased efficiency.
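   A minimal usage sketch (error checking omitted; m, n, row, col and val are assumed to be defined by the caller):
.vb
      Mat A;
      MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,m,n,5,NULL,&A);
      MatSetValues(A,1,&row,1,&col,&val,INSERT_VALUES);
      MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
      MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
      MatDestroy(&A);
.ve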
2976 2977 Level: intermediate 2978 2979 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2980 @*/ 2981 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2982 { 2983 PetscErrorCode ierr; 2984 2985 PetscFunctionBegin; 2986 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2987 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2988 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2989 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2990 PetscFunctionReturn(0); 2991 } 2992 2993 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 2994 { 2995 PetscErrorCode ierr; 2996 PetscSplitCSRDataStructure *d_mat = NULL; 2997 2998 PetscFunctionBegin; 2999 if (A->factortype == MAT_FACTOR_NONE) { 3000 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 3001 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3002 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3003 } else { 3004 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3005 } 3006 if (d_mat) { 3007 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3008 cudaError_t err; 3009 PetscSplitCSRDataStructure h_mat; 3010 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3011 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3012 if (a->compressedrow.use) { 3013 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3014 } 3015 err = cudaFree(d_mat);CHKERRCUDA(err); 3016 } 3017 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3018 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3019 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3020 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3021 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3022 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3023 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3024 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3025 PetscFunctionReturn(0); 3026 } 3027 3028 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3029 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3030 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3031 { 3032 PetscErrorCode ierr; 3033 3034 PetscFunctionBegin; 3035 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3036 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3037 PetscFunctionReturn(0); 3038 } 3039 3040 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc. 
3041 { 3042 PetscErrorCode ierr; 3043 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3044 PetscBool flgx,flgy; 3045 3046 PetscFunctionBegin; 3047 if (a == (PetscScalar)0.0) PetscFunctionReturn(0); 3048 PetscValidHeaderSpecific(Y,MAT_CLASSID,1); 3049 PetscValidHeaderSpecific(X,MAT_CLASSID,3); 3050 ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr); 3051 ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr); 3052 if (!flgx || !flgy) { 3053 ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 3054 PetscFunctionReturn(0); 3055 } 3056 if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE"); 3057 if (str == DIFFERENT_NONZERO_PATTERN) { 3058 if (x->nz == y->nz) { 3059 PetscBool e; 3060 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 3061 if (e) { 3062 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 3063 if (e) { 3064 str = SAME_NONZERO_PATTERN; 3065 } 3066 } 3067 } 3068 } 3069 if (str != SAME_NONZERO_PATTERN) { 3070 ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 3071 PetscFunctionReturn(0); 3072 } else { 3073 Mat_SeqAIJCUSPARSE *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3074 Mat_SeqAIJCUSPARSE *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr; 3075 if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3076 if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3077 if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) { 3078 if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) { 3079 ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr); 3080 } 3081 if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) { 3082 ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr); 3083 } 3084 ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3085 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3086 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3087 } else { 3088 cublasHandle_t cublasv2handle; 3089 cublasStatus_t cberr; 3090 cudaError_t err; 3091 PetscScalar alpha = a; 3092 PetscBLASInt one = 1, bnz = 1; 3093 CsrMatrix *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat; 3094 CsrMatrix *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat; 3095 PetscScalar *aa_y, *aa_x; 3096 aa_y = matrix_y->values->data().get(); 3097 aa_x = matrix_x->values->data().get(); 3098 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3099 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3100 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3101 cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr); 3102 err = WaitForCUDA();CHKERRCUDA(err); 3103 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3104 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3105 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3106 ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr); 3107 if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU; 3108 else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state"); 3109 ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr); 3110 } 3111 } 3112 PetscFunctionReturn(0); 3113 } 3114 3115 static PetscErrorCode 
MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3116 { 3117 PetscErrorCode ierr; 3118 PetscBool both = PETSC_FALSE; 3119 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3120 3121 PetscFunctionBegin; 3122 if (A->factortype == MAT_FACTOR_NONE) { 3123 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3124 if (spptr->mat) { 3125 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3126 if (matrix->values) { 3127 both = PETSC_TRUE; 3128 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3129 } 3130 } 3131 if (spptr->matTranspose) { 3132 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3133 if (matrix->values) { 3134 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3135 } 3136 } 3137 } 3138 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3139 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3140 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3141 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3142 else A->offloadmask = PETSC_OFFLOAD_CPU; 3143 3144 PetscFunctionReturn(0); 3145 } 3146 3147 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3148 { 3149 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3150 PetscErrorCode ierr; 3151 3152 PetscFunctionBegin; 3153 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3154 if (flg) { 3155 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3156 3157 A->ops->axpy = MatAXPY_SeqAIJ; 3158 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3159 A->ops->mult = MatMult_SeqAIJ; 3160 A->ops->multadd = MatMultAdd_SeqAIJ; 3161 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3162 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3163 A->ops->multhermitiantranspose = NULL; 3164 A->ops->multhermitiantransposeadd = NULL; 3165 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3166 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3167 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3168 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3169 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3170 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3171 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3172 } else { 3173 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3174 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3175 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3176 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3177 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3178 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3179 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3180 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3181 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3182 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3183 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3184 ierr = 
PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3185 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3186 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3187 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3188 } 3189 A->boundtocpu = flg; 3190 a->inode.use = flg; 3191 PetscFunctionReturn(0); 3192 } 3193 3194 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3195 { 3196 PetscErrorCode ierr; 3197 cusparseStatus_t stat; 3198 Mat B; 3199 3200 PetscFunctionBegin; 3201 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3202 if (reuse == MAT_INITIAL_MATRIX) { 3203 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3204 } else if (reuse == MAT_REUSE_MATRIX) { 3205 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3206 } 3207 B = *newmat; 3208 3209 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3210 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3211 3212 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3213 if (B->factortype == MAT_FACTOR_NONE) { 3214 Mat_SeqAIJCUSPARSE *spptr; 3215 3216 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3217 spptr->format = MAT_CUSPARSE_CSR; 3218 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3219 B->spptr = spptr; 3220 spptr->deviceMat = NULL; 3221 } else { 3222 Mat_SeqAIJCUSPARSETriFactors *spptr; 3223 3224 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3225 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3226 B->spptr = spptr; 3227 } 3228 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3229 } 3230 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3231 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3232 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3233 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3234 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3235 3236 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3237 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3238 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3239 PetscFunctionReturn(0); 3240 } 3241 3242 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3243 { 3244 PetscErrorCode ierr; 3245 3246 PetscFunctionBegin; 3247 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3248 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3249 ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 3250 ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 3251 ierr = PetscOptionsEnd();CHKERRQ(ierr); 3252 PetscFunctionReturn(0); 3253 } 3254 3255 /*MC 3256 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3257 3258 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3259 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3260 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 
3261 3262 Options Database Keys: 3263 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3264 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3265 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3266 3267 Level: beginner 3268 3269 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3270 M*/ 3271 3272 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 3273 3274 3275 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3276 { 3277 PetscErrorCode ierr; 3278 3279 PetscFunctionBegin; 3280 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3281 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3282 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3283 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3284 PetscFunctionReturn(0); 3285 } 3286 3287 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3288 { 3289 PetscErrorCode ierr; 3290 cusparseStatus_t stat; 3291 3292 PetscFunctionBegin; 3293 if (*cusparsestruct) { 3294 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3295 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3296 delete (*cusparsestruct)->workVector; 3297 delete (*cusparsestruct)->rowoffsets_gpu; 3298 delete (*cusparsestruct)->cooPerm; 3299 delete (*cusparsestruct)->cooPerm_a; 3300 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3301 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3302 cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 3303 #endif 3304 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3305 } 3306 PetscFunctionReturn(0); 3307 } 3308 3309 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3310 { 3311 PetscFunctionBegin; 3312 if (*mat) { 3313 delete (*mat)->values; 3314 delete (*mat)->column_indices; 3315 delete (*mat)->row_offsets; 3316 delete *mat; 3317 *mat = 0; 3318 } 3319 PetscFunctionReturn(0); 3320 } 3321 3322 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3323 { 3324 cusparseStatus_t stat; 3325 PetscErrorCode ierr; 3326 3327 PetscFunctionBegin; 3328 if (*trifactor) { 3329 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3330 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3331 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3332 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3333 if ((*trifactor)->AA_h) {cudaError_t cerr = 
cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3334 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3335 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3336 #endif 3337 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3338 } 3339 PetscFunctionReturn(0); 3340 } 3341 3342 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3343 { 3344 CsrMatrix *mat; 3345 cusparseStatus_t stat; 3346 cudaError_t err; 3347 3348 PetscFunctionBegin; 3349 if (*matstruct) { 3350 if ((*matstruct)->mat) { 3351 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3352 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3353 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3354 #else 3355 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3356 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3357 #endif 3358 } else { 3359 mat = (CsrMatrix*)(*matstruct)->mat; 3360 CsrMatrix_Destroy(&mat); 3361 } 3362 } 3363 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3364 delete (*matstruct)->cprowIndices; 3365 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3366 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3367 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3368 3369 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3370 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3371 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3372 for (int i=0; i<3; i++) { 3373 if (mdata->cuSpMV[i].initialized) { 3374 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3375 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3376 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3377 } 3378 } 3379 #endif 3380 delete *matstruct; 3381 *matstruct = NULL; 3382 } 3383 PetscFunctionReturn(0); 3384 } 3385 3386 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3387 { 3388 PetscErrorCode ierr; 3389 3390 PetscFunctionBegin; 3391 if (*trifactors) { 3392 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3393 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3394 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3395 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3396 delete (*trifactors)->rpermIndices; 3397 delete (*trifactors)->cpermIndices; 3398 delete (*trifactors)->workVector; 3399 (*trifactors)->rpermIndices = NULL; 3400 (*trifactors)->cpermIndices = NULL; 3401 (*trifactors)->workVector = NULL; 3402 } 3403 PetscFunctionReturn(0); 3404 } 3405 3406 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3407 { 3408 PetscErrorCode ierr; 3409 cusparseHandle_t handle; 3410 cusparseStatus_t stat; 3411 3412 PetscFunctionBegin; 3413 if (*trifactors) { 3414 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3415 if (handle = (*trifactors)->handle) { 3416 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3417 } 3418 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3419 } 3420 PetscFunctionReturn(0); 3421 } 
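/* Helper functors for the COO assembly routines below (MatSetPreallocationCOO_SeqAIJCUSPARSE and MatSetValuesCOO_SeqAIJCUSPARSE):
   IJCompare orders (i,j) pairs lexicographically for the sort that builds cooPerm, IJEqual identifies duplicate (i,j) pairs,
   and IJDiff/IJSum build the keys (cooPerm_a) that allow repeated entries to be summed later with thrust::reduce_by_key. */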
3422 3423 struct IJCompare 3424 { 3425 __host__ __device__ 3426 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3427 { 3428 if (t1.get<0>() < t2.get<0>()) return true; 3429 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3430 return false; 3431 } 3432 }; 3433 3434 struct IJEqual 3435 { 3436 __host__ __device__ 3437 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3438 { 3439 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3440 return true; 3441 } 3442 }; 3443 3444 struct IJDiff 3445 { 3446 __host__ __device__ 3447 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3448 { 3449 return t1 == t2 ? 0 : 1; 3450 } 3451 }; 3452 3453 struct IJSum 3454 { 3455 __host__ __device__ 3456 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3457 { 3458 return t1||t2; 3459 } 3460 }; 3461 3462 #include <thrust/iterator/discard_iterator.h> 3463 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3464 { 3465 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3466 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3467 THRUSTARRAY *cooPerm_v = NULL,*cooPerm_w; 3468 thrust::device_ptr<const PetscScalar> d_v; 3469 CsrMatrix *matrix; 3470 PetscErrorCode ierr; 3471 cudaError_t cerr; 3472 PetscInt n; 3473 3474 PetscFunctionBegin; 3475 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3476 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3477 if (!cusp->cooPerm) { 3478 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3479 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3480 PetscFunctionReturn(0); 3481 } 3482 matrix = (CsrMatrix*)cusp->mat->mat; 3483 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3484 ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 3485 if (!v) { 3486 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3487 goto finalize; 3488 } 3489 n = cusp->cooPerm->size(); 3490 if (isCudaMem(v)) { 3491 d_v = thrust::device_pointer_cast(v); 3492 } else { 3493 cooPerm_v = new THRUSTARRAY(n); 3494 cooPerm_v->assign(v,v+n); 3495 d_v = cooPerm_v->data(); 3496 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3497 } 3498 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3499 if (cusp->cooPerm_a) { 3500 cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3501 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3502 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3503 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3504 delete cooPerm_w; 3505 } else { 3506 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3507 matrix->values->begin())); 3508 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3509 matrix->values->end())); 3510 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3511 } 3512 } else { 3513 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> 
reduce */ 3514 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3515 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3516 } else { 3517 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3518 matrix->values->begin())); 3519 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3520 matrix->values->end())); 3521 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3522 } 3523 } 3524 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3525 ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 3526 finalize: 3527 delete cooPerm_v; 3528 A->offloadmask = PETSC_OFFLOAD_GPU; 3529 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3530 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3531 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3532 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3533 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3534 a->reallocs = 0; 3535 A->info.mallocs += 0; 3536 A->info.nz_unneeded = 0; 3537 A->assembled = A->was_assembled = PETSC_TRUE; 3538 A->num_ass++; 3539 /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */ 3540 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3541 PetscFunctionReturn(0); 3542 } 3543 3544 #include <thrust/binary_search.h> 3545 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3546 { 3547 PetscErrorCode ierr; 3548 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3549 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3550 PetscInt cooPerm_n, nzr = 0; 3551 cudaError_t cerr; 3552 3553 PetscFunctionBegin; 3554 ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 3555 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3556 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3557 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3558 if (n != cooPerm_n) { 3559 delete cusp->cooPerm; 3560 delete cusp->cooPerm_a; 3561 cusp->cooPerm = NULL; 3562 cusp->cooPerm_a = NULL; 3563 } 3564 if (n) { 3565 THRUSTINTARRAY d_i(n); 3566 THRUSTINTARRAY d_j(n); 3567 THRUSTINTARRAY ii(A->rmap->n); 3568 3569 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3570 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3571 3572 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3573 d_i.assign(coo_i,coo_i+n); 3574 d_j.assign(coo_j,coo_j+n); 3575 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3576 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3577 3578 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3579 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3580 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3581 *cusp->cooPerm_a = d_i; 3582 THRUSTINTARRAY w = d_j; 3583 3584 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3585 if (nekey == ekey) { /* all entries are unique */ 3586 delete cusp->cooPerm_a; 3587 cusp->cooPerm_a = NULL; 3588 } else { /* I couldn't come up with a more elegant algorithm */ 3589 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3590 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3591 (*cusp->cooPerm_a)[0] = 0; 3592 w[0] = 0; 3593 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3594 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3595 } 3596 thrust::counting_iterator<PetscInt> search_begin(0); 3597 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3598 search_begin, search_begin + A->rmap->n, 3599 ii.begin()); 3600 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3601 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3602 3603 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3604 a->singlemalloc = PETSC_FALSE; 3605 a->free_a = PETSC_TRUE; 3606 a->free_ij = PETSC_TRUE; 3607 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3608 a->i[0] = 0; 3609 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3610 a->nz = a->maxnz = a->i[A->rmap->n]; 3611 a->rmax = 0; 3612 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3613 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3614 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3615 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3616 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3617 for (PetscInt i = 0; i < A->rmap->n; i++) { 3618 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3619 nzr += (PetscInt)!!(nnzr); 3620 a->ilen[i] = a->imax[i] = nnzr; 3621 a->rmax = PetscMax(a->rmax,nnzr); 3622 } 3623 a->nonzerorowcnt = nzr; 3624 A->preallocated = PETSC_TRUE; 3625 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3626 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3627 } else { 3628 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3629 } 3630 ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 3631 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3632 3633 /* We want to allocate the CUSPARSE struct for matvec now. 
3634 The code is so convoluted now that I prefer to copy zeros */ 3635 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3636 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3637 A->offloadmask = PETSC_OFFLOAD_CPU; 3638 A->nonzerostate++; 3639 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3640 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3641 3642 A->assembled = PETSC_FALSE; 3643 A->was_assembled = PETSC_FALSE; 3644 PetscFunctionReturn(0); 3645 } 3646 3647 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3648 { 3649 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3650 CsrMatrix *csr; 3651 PetscErrorCode ierr; 3652 3653 PetscFunctionBegin; 3654 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3655 PetscValidPointer(a,2); 3656 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3657 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3658 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3659 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3660 csr = (CsrMatrix*)cusp->mat->mat; 3661 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3662 *a = csr->values->data().get(); 3663 PetscFunctionReturn(0); 3664 } 3665 3666 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3667 { 3668 PetscFunctionBegin; 3669 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3670 PetscValidPointer(a,2); 3671 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3672 *a = NULL; 3673 PetscFunctionReturn(0); 3674 } 3675 3676 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3677 { 3678 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3679 CsrMatrix *csr; 3680 3681 PetscFunctionBegin; 3682 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3683 PetscValidPointer(a,2); 3684 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3685 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3686 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3687 csr = (CsrMatrix*)cusp->mat->mat; 3688 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3689 *a = csr->values->data().get(); 3690 PetscFunctionReturn(0); 3691 } 3692 3693 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3694 { 3695 PetscErrorCode ierr; 3696 3697 PetscFunctionBegin; 3698 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3699 PetscValidPointer(a,2); 3700 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3701 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3702 A->offloadmask = PETSC_OFFLOAD_GPU; 3703 *a = NULL; 3704 PetscFunctionReturn(0); 3705 } 3706 3707 struct IJCompare4 3708 { 3709 __host__ __device__ 3710 inline bool operator() (const thrust::tuple<int, int, PetscScalar, bool> &t1, const thrust::tuple<int, int, PetscScalar, bool> &t2) 3711 { 3712 if (t1.get<0>() < t2.get<0>()) return true; 3713 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3714 return false; 3715 } 3716 }; 3717 3718 struct Shift 3719 { 3720 int _shift; 3721 3722 Shift(int shift) : _shift(shift) {} 3723 __host__ __device__ 3724 inline int operator() (const int &c) 3725 { 3726 return c + _shift; 3727 } 3728 }; 3729 3730 /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3731 PetscErrorCode 
/* Merges two SeqAIJCUSPARSE matrices side by side, i.e. C = [A B] ([A';B']' in MATLAB notation) */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m    = A->rmap->n;
    n    = A->cmap->n + B->cmap->n;
    ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Ccusp->nrows      = m;
    Ccusp->mat        = Cmat;
    Ccusp->mat->mat   = Ccsr;
    Ccsr->num_rows    = m;
    Ccsr->num_cols    = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
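    /* Strategy for the merge below: expand the row offsets of A and B to COO row indices
       (cusparseXcsr2coo), shift B's column indices by A->cmap->n, and combine the two
       (row,col,value,from-A flag) streams, already sorted by (row,col), with thrust::merge.
       The positions that A's and B's entries take in C are recorded in Ccusp->cooPerm
       (the first Annz slots for A, the remaining Bnnz for B) so that the MAT_REUSE_MATRIX
       path only needs to scatter new values.  cusparseXcoo2csr then rebuilds C's row offsets. */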
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);

    if (c->nz) {
      THRUSTINTARRAY32 Acoo(Annz);
      THRUSTINTARRAY32 Bcoo(Bnnz);
      THRUSTINTARRAY32 *roff;
      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Acusp->rowoffsets_gpu;
      } else roff = Acsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              roff->data().get(),
                              Annz,
                              m,
                              Acoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Bcusp->rowoffsets_gpu;
      } else roff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              roff->data().get(),
                              Bnnz,
                              m,
                              Bcoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      THRUSTINTARRAY32 Ccoo(c->nz);
      auto Aperm = thrust::make_constant_iterator(true);
      auto Bperm = thrust::make_constant_iterator(false);
      auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
      thrust::device_vector<bool> wPerm(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm.begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      thrust::merge(Azb,Aze,Bzb,Bze,Czb,IJCompare4());
      thrust::partition_copy(thrust::make_counting_iterator(zero),thrust::make_counting_iterator(c->nz),wPerm.begin(),p1,p2,thrust::identity<bool>());
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo.data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have their transposes available, generate the transpose of C too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen     = PETSC_TRUE;
        CmatT->cprowIndices = NULL;
        CmatT->mat          = CcsrT;
        CcsrT->num_rows     = n;
        CcsrT->num_cols     = m;
        CcsrT->num_entries  = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32-to-64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
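      /* Only the numerical values have changed: scatter A's and B's current values into C using
         the positions recorded in cooPerm at creation time (the first Acsr->num_entries slots
         of cooPerm correspond to A's entries, the remaining ones to B's) */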
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
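/* Example (sketch, not part of the library): horizontally concatenating two MATSEQAIJCUSPARSE
   matrices A and B with the routine above.  A, B and C are hypothetical caller-side variables;
   error checking is abbreviated.

     Mat C;
     ierr = MatSeqAIJCUSPARSEMergeMats(A,B,MAT_INITIAL_MATRIX,&C);CHKERRQ(ierr);   // C = [A B]
     ...                                    // change the values (not the pattern) of A and B
     ierr = MatSeqAIJCUSPARSEMergeMats(A,B,MAT_REUSE_MATRIX,&C);CHKERRQ(ierr);     // refresh C's values
     ierr = MatDestroy(&C);CHKERRQ(ierr);

   MAT_INPLACE_MATRIX is not supported, and both inputs must have the same number of rows. */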