/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command-line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
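/* A note on how the arrays above are consumed (a clarifying sketch, not part of
   the cuSPARSE API): PetscOptionsEnum() returns the *index* of the matched
   string, so each array must list the names in 0-based enum-value order. For
   example,

     -mat_cusparse_spmv_alg CSRMV_ALG1

   matches index 2 in MatCUSPARSESpMVAlgorithms[], which must coincide with
   CUSPARSE_CSRMV_ALG1 == 2; MatSetFromOptions_SeqAIJCUSPARSE() below contains
   guards that verify this correspondence. */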
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
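/* A minimal usage sketch for the setter above (hypothetical caller code;
   assumes A is already of type MATSEQAIJCUSPARSE and the caller manages the
   stream's lifetime):

     cudaStream_t stream;
     cerr = cudaStreamCreate(&stream);CHKERRCUDA(cerr);
     ierr = MatCUSPARSESetStream(A,stream);CHKERRQ(ierr);

   after which the matrix's cuSPARSE calls are queued on 'stream' instead of
   the default stream. */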
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  on a single GPU of type SEQAIJCUSPARSE, AIJCUSPARSE, SEQAIJCUSP, or AIJCUSP. The currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm, but the performance can be quite poor, so these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
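/* A minimal usage sketch (assumes an existing KSP whose operator is a
   MATSEQAIJCUSPARSE matrix):

     ierr = KSPGetPC(ksp,&pc);CHKERRQ(ierr);
     ierr = PCSetType(pc,PCILU);CHKERRQ(ierr);
     ierr = PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);CHKERRQ(ierr);

   or equivalently on the command line:

     -pc_type ilu -pc_factor_mat_solver_type cusparse */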
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.

  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
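/* A minimal usage sketch (assumes A is a MATSEQAIJCUSPARSE matrix; note that
   the ELL and HYB formats are only available before CUDA 11.0):

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   or on the command line:

     -mat_cusparse_mult_storage_format ell */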
/*@
  MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

  Collective on mat

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
- transgen - the boolean flag

  Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present, to prevent logic errors if transgen is set to PETSC_TRUE later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
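/* The options registered above are consumed during MatSetFromOptions(); a
   hypothetical invocation exercising them (the SpMV/SpMM/csr2csc algorithm
   options exist only when built against CUDA 11.0 or later) might be:

     ./app -mat_type seqaijcusparse -mat_cusparse_transgen \
           -mat_cusparse_storage_format csr -mat_cusparse_spmv_alg CSRMV_ALG1 */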
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix, including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt) 0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt) 0;
        AALo[0]   = (MatScalar) 1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
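/* A small worked sketch of the layout assembled above (hypothetical n = 3
   factor with strictly lower entries l10, l20, l21; the unit diagonal is
   stored explicitly so cuSPARSE can treat the factor as a general CSR matrix):

     AiLo = {0, 1, 3, 6}                 (nzLower = n + ai[n] - ai[1] = 6)
     AjLo = {0,  0, 1,  0, 1, 2}
     AALo = {1., l10, 1., l20, l21, 1.}

   i.e. each row stores its strictly lower entries followed by an explicit 1.0
   on the diagonal. */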
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset += 1;
          if (nz > 0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset += nz;
          }
        }
        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset += 1;
          if (nz > 0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset += nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upper triangular factor");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing lower triangular factor");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
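/* Value layout produced by the fill loops above, for a factor row i whose
   diagonal d_i is stored last in the row (v[nz]) with upper entries u_ij:

     AAUp = { 1/d_i, -u_ij, ... }       upper factor; solved with a unit
                                        diagonal, so the stored 1/d_i is not
                                        referenced during the solve
     AALo = { 1/d_i, -u_ij/d_i, ... }   same sparsity pattern, used as the
                                        lower factor via
                                        CUSPARSE_OPERATION_TRANSPOSE */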
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
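/* The nnz bookkeeping above counts both triangles of the symmetric factor:
   e.g. for n = 3 and a->nz = 6 stored entries (3 diagonal + 3 strictly upper),
   nnz = (6-3)*2 + 3 = 9, i.e. the entry count of the full matrix. */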
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
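/* csr2csc is used throughout this routine purely as a transpose: writing the
   CSC form of a CSR matrix yields exactly the CSR representation of its
   transpose. A tiny sketch with a 2x2 upper triangular matrix:

     U   (CSR): row_offsets {0,2,3}, cols {0,1, 1}, vals {u00,u01, u11}
     U^T (CSR): row_offsets {0,1,3}, cols {0,  0,1}, vals {u00, u01,u11} */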
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows       = A->cmap->n;
    matrixT->num_cols       = A->rmap->n;
    matrixT->num_entries    = a->nz;
    matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
#endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                            );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
the CSC */ 1242 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1243 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1244 A->cmap->n, matrix->num_entries, 1245 matrix->values->data().get(), 1246 cusparsestruct->rowoffsets_gpu->data().get(), 1247 matrix->column_indices->data().get(), 1248 matrixT->values->data().get(), 1249 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1250 CUSPARSE_ACTION_NUMERIC,indexBase, 1251 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1252 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1253 #endif 1254 1255 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1256 A->cmap->n, matrix->num_entries, 1257 matrix->values->data().get(), 1258 cusparsestruct->rowoffsets_gpu->data().get(), 1259 matrix->column_indices->data().get(), 1260 matrixT->values->data().get(), 1261 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1262 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1263 CUSPARSE_ACTION_NUMERIC,indexBase, 1264 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1265 #else 1266 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1267 CUSPARSE_ACTION_NUMERIC, indexBase 1268 #endif 1269 );CHKERRCUSPARSE(stat); 1270 matstructT->mat = matrixT; 1271 1272 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1273 stat = cusparseCreateCsr(&matstructT->matDescr, 1274 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1275 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1276 matrixT->values->data().get(), 1277 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1278 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1279 #endif 1280 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1281 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1282 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1283 #else 1284 CsrMatrix *temp = new CsrMatrix; 1285 CsrMatrix *tempT = new CsrMatrix; 1286 /* First convert HYB to CSR */ 1287 temp->num_rows = A->rmap->n; 1288 temp->num_cols = A->cmap->n; 1289 temp->num_entries = a->nz; 1290 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1291 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1292 temp->values = new THRUSTARRAY(a->nz); 1293 1294 stat = cusparse_hyb2csr(cusparsestruct->handle, 1295 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1296 temp->values->data().get(), 1297 temp->row_offsets->data().get(), 1298 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1299 1300 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution.
*/ 1432 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1433 1434 /* restore */ 1435 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1436 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1437 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1438 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1439 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1440 PetscFunctionReturn(0); 1441 } 1442 1443 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1444 { 1445 const PetscScalar *barray; 1446 PetscScalar *xarray; 1447 cusparseStatus_t stat; 1448 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1449 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1450 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1451 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1452 PetscErrorCode ierr; 1453 cudaError_t cerr; 1454 1455 PetscFunctionBegin; 1456 /* Analyze the matrix and create the transpose ... on the fly */ 1457 if (!loTriFactorT && !upTriFactorT) { 1458 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1459 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1460 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1461 } 1462 1463 /* Get the GPU pointers */ 1464 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1465 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1466 1467 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1468 /* First, solve U */ 1469 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1470 upTriFactorT->csrMat->num_rows, 1471 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1472 upTriFactorT->csrMat->num_entries, 1473 #endif 1474 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1475 upTriFactorT->csrMat->values->data().get(), 1476 upTriFactorT->csrMat->row_offsets->data().get(), 1477 upTriFactorT->csrMat->column_indices->data().get(), 1478 upTriFactorT->solveInfo, 1479 barray, tempGPU->data().get() 1480 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1481 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1482 #endif 1483 );CHKERRCUSPARSE(stat); 1484 1485 /* Then, solve L */ 1486 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1487 loTriFactorT->csrMat->num_rows, 1488 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1489 loTriFactorT->csrMat->num_entries, 1490 #endif 1491 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1492 loTriFactorT->csrMat->values->data().get(), 1493 loTriFactorT->csrMat->row_offsets->data().get(), 1494 loTriFactorT->csrMat->column_indices->data().get(), 1495 loTriFactorT->solveInfo, 1496 tempGPU->data().get(), xarray 1497 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1498 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1499 #endif 1500 );CHKERRCUSPARSE(stat); 1501 1502 /* restore */ 1503 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1504 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1505 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1506 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1507 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1508 PetscFunctionReturn(0); 1509 } 1510 1511 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1512 { 1513 const PetscScalar *barray; 1514 PetscScalar 
*xarray; 1515 thrust::device_ptr<const PetscScalar> bGPU; 1516 thrust::device_ptr<PetscScalar> xGPU; 1517 cusparseStatus_t stat; 1518 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1519 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1520 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1521 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1522 PetscErrorCode ierr; 1523 cudaError_t cerr; 1524 1525 PetscFunctionBegin; 1526 1527 /* Get the GPU pointers */ 1528 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1529 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1530 xGPU = thrust::device_pointer_cast(xarray); 1531 bGPU = thrust::device_pointer_cast(barray); 1532 1533 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1534 /* First, reorder with the row permutation */ 1535 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1536 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1537 tempGPU->begin()); 1538 1539 /* Next, solve L */ 1540 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1541 loTriFactor->csrMat->num_rows, 1542 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1543 loTriFactor->csrMat->num_entries, 1544 #endif 1545 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1546 loTriFactor->csrMat->values->data().get(), 1547 loTriFactor->csrMat->row_offsets->data().get(), 1548 loTriFactor->csrMat->column_indices->data().get(), 1549 loTriFactor->solveInfo, 1550 tempGPU->data().get(), xarray 1551 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1552 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1553 #endif 1554 );CHKERRCUSPARSE(stat); 1555 1556 /* Then, solve U */ 1557 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1558 upTriFactor->csrMat->num_rows, 1559 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1560 upTriFactor->csrMat->num_entries, 1561 #endif 1562 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1563 upTriFactor->csrMat->values->data().get(), 1564 upTriFactor->csrMat->row_offsets->data().get(), 1565 upTriFactor->csrMat->column_indices->data().get(), 1566 upTriFactor->solveInfo, 1567 xarray, tempGPU->data().get() 1568 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1569 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1570 #endif 1571 );CHKERRCUSPARSE(stat); 1572 1573 /* Last, reorder with the column permutation */ 1574 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1575 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1576 xGPU); 1577 1578 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1579 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1580 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1581 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1582 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1583 PetscFunctionReturn(0); 1584 } 1585 1586 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1587 { 1588 const PetscScalar *barray; 1589 PetscScalar *xarray; 1590 cusparseStatus_t stat; 1591 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1592 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1593 
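  /* Note: this is the natural-ordering variant, where the factors were computed without
     row/column permutations, so the two permutation copies used in MatSolve_SeqAIJCUSPARSE
     above are not needed. Dataflow sketch (illustrative comment only) for A = L*U:

       solve L * t = b   (barray  -> tempGPU)
       solve U * x = t   (tempGPU -> xarray)

     i.e. the standard forward/backward triangular substitution. */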
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1594 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1595 PetscErrorCode ierr; 1596 cudaError_t cerr; 1597 1598 PetscFunctionBegin; 1599 /* Get the GPU pointers */ 1600 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1601 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1602 1603 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1604 /* First, solve L */ 1605 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1606 loTriFactor->csrMat->num_rows, 1607 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1608 loTriFactor->csrMat->num_entries, 1609 #endif 1610 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1611 loTriFactor->csrMat->values->data().get(), 1612 loTriFactor->csrMat->row_offsets->data().get(), 1613 loTriFactor->csrMat->column_indices->data().get(), 1614 loTriFactor->solveInfo, 1615 barray, tempGPU->data().get() 1616 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1617 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1618 #endif 1619 );CHKERRCUSPARSE(stat); 1620 1621 /* Next, solve U */ 1622 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1623 upTriFactor->csrMat->num_rows, 1624 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1625 upTriFactor->csrMat->num_entries, 1626 #endif 1627 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1628 upTriFactor->csrMat->values->data().get(), 1629 upTriFactor->csrMat->row_offsets->data().get(), 1630 upTriFactor->csrMat->column_indices->data().get(), 1631 upTriFactor->solveInfo, 1632 tempGPU->data().get(), xarray 1633 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1634 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1635 #endif 1636 );CHKERRCUSPARSE(stat); 1637 1638 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1639 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1640 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1641 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1642 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1643 PetscFunctionReturn(0); 1644 } 1645 1646 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1647 { 1648 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1649 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1650 cudaError_t cerr; 1651 PetscErrorCode ierr; 1652 1653 PetscFunctionBegin; 1654 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1655 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1656 1657 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1658 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1659 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1660 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1661 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1662 A->offloadmask = PETSC_OFFLOAD_BOTH; 1663 } 1664 PetscFunctionReturn(0); 1665 } 1666 1667 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1668 { 1669 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1670 PetscErrorCode ierr; 1671 1672 PetscFunctionBegin; 1673 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1674 *array = a->a; 1675 A->offloadmask = PETSC_OFFLOAD_CPU; 1676 PetscFunctionReturn(0); 1677 } 1678 1679 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1680 { 1681 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1682 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1683 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data;
  PetscInt           m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;
  cudaError_t        err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT if it has been built already */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, matrix->num_entries,
                                matrix->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                                );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err =
cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1755 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1756 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1757 1758 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1759 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1760 /* set the matrix */ 1761 CsrMatrix *mat= new CsrMatrix; 1762 mat->num_rows = m; 1763 mat->num_cols = A->cmap->n; 1764 mat->num_entries = a->nz; 1765 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1766 mat->row_offsets->assign(ii, ii + m+1); 1767 1768 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1769 mat->column_indices->assign(a->j, a->j+a->nz); 1770 1771 mat->values = new THRUSTARRAY(a->nz); 1772 mat->values->assign(a->a, a->a+a->nz); 1773 1774 /* assign the pointer */ 1775 matstruct->mat = mat; 1776 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1777 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1778 stat = cusparseCreateCsr(&matstruct->matDescr, 1779 mat->num_rows, mat->num_cols, mat->num_entries, 1780 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1781 mat->values->data().get(), 1782 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1783 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1784 } 1785 #endif 1786 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1787 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1788 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1789 #else 1790 CsrMatrix *mat= new CsrMatrix; 1791 mat->num_rows = m; 1792 mat->num_cols = A->cmap->n; 1793 mat->num_entries = a->nz; 1794 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1795 mat->row_offsets->assign(ii, ii + m+1); 1796 1797 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1798 mat->column_indices->assign(a->j, a->j+a->nz); 1799 1800 mat->values = new THRUSTARRAY(a->nz); 1801 mat->values->assign(a->a, a->a+a->nz); 1802 1803 cusparseHybMat_t hybMat; 1804 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1805 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1806 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1807 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1808 matstruct->descr, mat->values->data().get(), 1809 mat->row_offsets->data().get(), 1810 mat->column_indices->data().get(), 1811 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1812 /* assign the pointer */ 1813 matstruct->mat = hybMat; 1814 1815 if (mat) { 1816 if (mat->values) delete (THRUSTARRAY*)mat->values; 1817 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1818 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1819 delete (CsrMatrix*)mat; 1820 } 1821 #endif 1822 } 1823 1824 /* assign the compressed row indices */ 1825 if (a->compressedrow.use) { 1826 cusparsestruct->workVector = new THRUSTARRAY(m); 1827 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1828 matstruct->cprowIndices->assign(ridx,ridx+m); 1829 tmp = m; 1830 } else { 1831 cusparsestruct->workVector = NULL; 1832 matstruct->cprowIndices = NULL; 1833 tmp = 0; 1834 } 1835 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1836 1837 /* assign the pointer */ 1838 cusparsestruct->mat = matstruct; 1839 } catch(char *ex) { 1840 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1841 } 1842 err = WaitForCUDA();CHKERRCUDA(err); 1843 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1844 cusparsestruct->nonzerostate = A->nonzerostate; 1845 } 1846 A->offloadmask = PETSC_OFFLOAD_BOTH; 1847 } 1848 PetscFunctionReturn(0); 1849 } 1850 1851 struct VecCUDAPlusEquals 1852 { 1853 template <typename Tuple> 1854 __host__ __device__ 1855 void operator()(Tuple t) 1856 { 1857 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1858 } 1859 }; 1860 1861 struct VecCUDAEquals 1862 { 1863 template <typename Tuple> 1864 __host__ __device__ 1865 void operator()(Tuple t) 1866 { 1867 thrust::get<1>(t) = thrust::get<0>(t); 1868 } 1869 }; 1870 1871 struct VecCUDAEqualsReverse 1872 { 1873 template <typename Tuple> 1874 __host__ __device__ 1875 void operator()(Tuple t) 1876 { 1877 thrust::get<0>(t) = thrust::get<1>(t); 1878 } 1879 }; 1880 1881 struct MatMatCusparse { 1882 PetscBool cisdense; 1883 PetscScalar *Bt; 1884 Mat X; 1885 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1886 PetscLogDouble flops; 1887 CsrMatrix *Bcsr; 1888 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1889 cusparseSpMatDescr_t matSpBDescr; 1890 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1891 cusparseDnMatDescr_t matBDescr; 1892 cusparseDnMatDescr_t matCDescr; 1893 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1894 size_t mmBufferSize; 1895 void *mmBuffer; 1896 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1897 cusparseSpGEMMDescr_t spgemmDesc; 1898 #endif 1899 }; 1900 1901 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1902 { 1903 PetscErrorCode ierr; 1904 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1905 cudaError_t cerr; 1906 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1907 cusparseStatus_t stat; 1908 #endif 1909 1910 PetscFunctionBegin; 1911 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1912 delete mmdata->Bcsr; 1913 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1914 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1915 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1916 if 
(mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1917 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1918 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1919 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1920 #endif 1921 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1922 ierr = PetscFree(data);CHKERRQ(ierr); 1923 PetscFunctionReturn(0); 1924 } 1925 1926 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1927 1928 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1929 { 1930 Mat_Product *product = C->product; 1931 Mat A,B; 1932 PetscInt m,n,blda,clda; 1933 PetscBool flg,biscuda; 1934 Mat_SeqAIJCUSPARSE *cusp; 1935 cusparseStatus_t stat; 1936 cusparseOperation_t opA; 1937 const PetscScalar *barray; 1938 PetscScalar *carray; 1939 PetscErrorCode ierr; 1940 MatMatCusparse *mmdata; 1941 Mat_SeqAIJCUSPARSEMultStruct *mat; 1942 CsrMatrix *csrmat; 1943 cudaError_t cerr; 1944 1945 PetscFunctionBegin; 1946 MatCheckProduct(C,1); 1947 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1948 mmdata = (MatMatCusparse*)product->data; 1949 A = product->A; 1950 B = product->B; 1951 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1952 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1953 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1954 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1955 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1956 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1957 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1958 switch (product->type) { 1959 case MATPRODUCT_AB: 1960 case MATPRODUCT_PtAP: 1961 mat = cusp->mat; 1962 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1963 m = A->rmap->n; 1964 n = B->cmap->n; 1965 break; 1966 case MATPRODUCT_AtB: 1967 if (!cusp->transgen) { 1968 mat = cusp->mat; 1969 opA = CUSPARSE_OPERATION_TRANSPOSE; 1970 } else { 1971 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1972 mat = cusp->matTranspose; 1973 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1974 } 1975 m = A->cmap->n; 1976 n = B->cmap->n; 1977 break; 1978 case MATPRODUCT_ABt: 1979 case MATPRODUCT_RARt: 1980 mat = cusp->mat; 1981 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1982 m = A->rmap->n; 1983 n = B->rmap->n; 1984 break; 1985 default: 1986 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1987 } 1988 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1989 csrmat = (CsrMatrix*)mat->mat; 1990 /* if the user passed a CPU matrix, copy the data to the GPU */ 1991 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1992 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1993 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1994 1995 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1996 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1997 ierr = 
MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2065 blda = B->cmap->n; 2066 k = B->cmap->n; 2067 } else { 2068 k = B->rmap->n; 2069 } 2070 2071 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2072 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2073 csrmat->num_entries,mat->alpha_one,mat->descr, 2074 csrmat->values->data().get(), 2075 csrmat->row_offsets->data().get(), 2076 csrmat->column_indices->data().get(), 2077 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2078 carray,clda);CHKERRCUSPARSE(stat); 2079 #endif 2080 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2081 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2082 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2083 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2084 if (product->type == MATPRODUCT_RARt) { 2085 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2086 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2087 } else if (product->type == MATPRODUCT_PtAP) { 2088 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2089 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2090 } else { 2091 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2092 } 2093 if (mmdata->cisdense) { 2094 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2095 } 2096 if (!biscuda) { 2097 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2098 } 2099 PetscFunctionReturn(0); 2100 } 2101 2102 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2103 { 2104 Mat_Product *product = C->product; 2105 Mat A,B; 2106 PetscInt m,n; 2107 PetscBool cisdense,flg; 2108 PetscErrorCode ierr; 2109 MatMatCusparse *mmdata; 2110 Mat_SeqAIJCUSPARSE *cusp; 2111 2112 PetscFunctionBegin; 2113 MatCheckProduct(C,1); 2114 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2115 A = product->A; 2116 B = product->B; 2117 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2118 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2119 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2120 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2121 switch (product->type) { 2122 case MATPRODUCT_AB: 2123 m = A->rmap->n; 2124 n = B->cmap->n; 2125 break; 2126 case MATPRODUCT_AtB: 2127 m = A->cmap->n; 2128 n = B->cmap->n; 2129 break; 2130 case MATPRODUCT_ABt: 2131 m = A->rmap->n; 2132 n = B->rmap->n; 2133 break; 2134 case MATPRODUCT_PtAP: 2135 m = B->cmap->n; 2136 n = B->cmap->n; 2137 break; 2138 case MATPRODUCT_RARt: 2139 m = B->rmap->n; 2140 n = B->rmap->n; 2141 break; 2142 default: 2143 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2144 } 2145 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2146 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2147 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2148 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2149 2150 /* product data */ 2151 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2152 mmdata->cisdense = cisdense; 2153 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2154 /* cusparseXcsrmm does not support transpose on B, so 
we allocate a buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2219 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2220 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2221 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2222 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2223 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2224 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2225 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2226 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2227 2228 ptype = product->type; 2229 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2230 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2231 switch (ptype) { 2232 case MATPRODUCT_AB: 2233 Amat = Acusp->mat; 2234 Bmat = Bcusp->mat; 2235 break; 2236 case MATPRODUCT_AtB: 2237 Amat = Acusp->matTranspose; 2238 Bmat = Bcusp->mat; 2239 break; 2240 case MATPRODUCT_ABt: 2241 Amat = Acusp->mat; 2242 Bmat = Bcusp->matTranspose; 2243 break; 2244 default: 2245 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2246 } 2247 Cmat = Ccusp->mat; 2248 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2249 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2250 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2251 Acsr = (CsrMatrix*)Amat->mat; 2252 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2253 Ccsr = (CsrMatrix*)Cmat->mat; 2254 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2255 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2256 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2257 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2258 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2259 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2260 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2261 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2262 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2263 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2264 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2265 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2266 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2267 #else 2268 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2269 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2270 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2271 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2272 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2273 #endif 2274 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2275 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2276 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2277 C->offloadmask = PETSC_OFFLOAD_GPU; 2278 finalize: 2279 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2280 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2281 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2282 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2283 c->reallocs = 0; 2284 C->info.mallocs += 0; 2285 C->info.nz_unneeded = 0; 2286 C->assembled = C->was_assembled = PETSC_TRUE; 2287 C->num_ass++; 2288 PetscFunctionReturn(0); 2289 } 2290 2291 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2292 { 2293 Mat_Product *product = C->product; 2294 Mat A,B; 2295 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2296 Mat_SeqAIJ *a,*b,*c; 2297 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2298 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2299 PetscInt i,j,m,n,k; 2300 PetscBool flg; 2301 PetscErrorCode ierr; 2302 cusparseStatus_t stat; 2303 cudaError_t cerr; 2304 MatProductType ptype; 2305 MatMatCusparse *mmdata; 2306 PetscLogDouble flops; 2307 PetscBool biscompressed,ciscompressed; 2308 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2309 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2310 size_t bufSize2; 2311 cusparseSpMatDescr_t BmatSpDescr; 2312 #else 2313 int cnz; 2314 #endif 2315 2316 PetscFunctionBegin; 2317 MatCheckProduct(C,1); 2318 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2319 A = product->A; 2320 B = product->B; 2321 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2322 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2323 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2324 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2325 a = (Mat_SeqAIJ*)A->data; 2326 b = (Mat_SeqAIJ*)B->data; 2327 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2328 Bcusp = 
(Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows = ciscompressed ?
c->compressedrow.nrows : m; 2400 Ccusp->mat = Cmat; 2401 Ccusp->mat->mat = Ccsr; 2402 Ccsr->num_rows = Ccusp->nrows; 2403 Ccsr->num_cols = n; 2404 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2405 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2406 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2407 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2408 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2409 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2410 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2411 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2412 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2413 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2414 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2415 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2416 c->nz = 0; 2417 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2418 Ccsr->values = new THRUSTARRAY(c->nz); 2419 goto finalizesym; 2420 } 2421 2422 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2423 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2424 Acsr = (CsrMatrix*)Amat->mat; 2425 if (!biscompressed) { 2426 Bcsr = (CsrMatrix*)Bmat->mat; 2427 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2428 BmatSpDescr = Bmat->matDescr; 2429 #endif 2430 } else { /* we need to use row offsets for the full matrix */ 2431 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2432 Bcsr = new CsrMatrix; 2433 Bcsr->num_rows = B->rmap->n; 2434 Bcsr->num_cols = cBcsr->num_cols; 2435 Bcsr->num_entries = cBcsr->num_entries; 2436 Bcsr->column_indices = cBcsr->column_indices; 2437 Bcsr->values = cBcsr->values; 2438 if (!Bcusp->rowoffsets_gpu) { 2439 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2440 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2441 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2442 } 2443 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2444 mmdata->Bcsr = Bcsr; 2445 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2446 if (Bcsr->num_rows && Bcsr->num_cols) { 2447 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2448 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2449 Bcsr->values->data().get(), 2450 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2451 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2452 } 2453 BmatSpDescr = mmdata->matSpBDescr; 2454 #endif 2455 } 2456 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2457 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2458 /* precompute flops count */ 2459 if (ptype == MATPRODUCT_AB) { 2460 for (i=0, flops = 0; i<A->rmap->n; i++) { 2461 const PetscInt st = a->i[i]; 2462 const PetscInt en = a->i[i+1]; 2463 for (j=st; j<en; j++) { 2464 const PetscInt brow = 
a->j[j]; 2465 flops += 2.*(b->i[brow+1] - b->i[brow]); 2466 } 2467 } 2468 } else if (ptype == MATPRODUCT_AtB) { 2469 for (i=0, flops = 0; i<A->rmap->n; i++) { 2470 const PetscInt anzi = a->i[i+1] - a->i[i]; 2471 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2472 flops += (2.*anzi)*bnzi; 2473 } 2474 } else { /* TODO */ 2475 flops = 0.; 2476 } 2477 2478 mmdata->flops = flops; 2479 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2480 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2481 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2482 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2483 NULL, NULL, NULL, 2484 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2485 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2486 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2487 /* ask bufferSize bytes for external memory */ 2488 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2489 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2490 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2491 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2492 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUSPARSE(stat); 2493 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2494 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2495 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2496 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2497 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2498 /* ask bufferSize again bytes for external memory */ 2499 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2500 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2501 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2502 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2503 /* The CUSPARSE documentation is not clear, nor the API 2504 We need both buffers to perform the operations properly! 2505 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2506 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2507 is stored in the descriptor! What a messy API... 
*/ 2508 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2509 /* compute the intermediate product of A * B */ 2510 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2511 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2512 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2513 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2514 /* get matrix C non-zero entries C_nnz1 */ 2515 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2516 c->nz = (PetscInt) C_nnz1; 2517 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2518 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2519 Ccsr->values = new THRUSTARRAY(c->nz); 2520 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2521 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2522 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2523 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2524 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2525 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2526 #else 2527 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2528 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2529 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2530 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2531 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2532 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2533 c->nz = cnz; 2534 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2535 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2536 Ccsr->values = new THRUSTARRAY(c->nz); 2537 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2538 2539 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2540 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2541 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for values, but it seems quite buggy when 2542 D is NULL, despite the fact that CUSPARSE documentation claims it is supported!
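   As a consequence, the csrgemm call below performs the full numeric product even though only
   the symbolic structure of C is needed at this point.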
*/ 2543 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2544 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2545 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2546 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2547 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2548 #endif 2549 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2550 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2551 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2552 finalizesym: 2553 c->singlemalloc = PETSC_FALSE; 2554 c->free_a = PETSC_TRUE; 2555 c->free_ij = PETSC_TRUE; 2556 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2557 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2558 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2559 PetscInt *d_i = c->i; 2560 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2561 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2562 ii = *Ccsr->row_offsets; 2563 jj = *Ccsr->column_indices; 2564 if (ciscompressed) d_i = c->compressedrow.i; 2565 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2566 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2567 } else { 2568 PetscInt *d_i = c->i; 2569 if (ciscompressed) d_i = c->compressedrow.i; 2570 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2571 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2572 } 2573 if (ciscompressed) { /* need to expand host row offsets */ 2574 PetscInt r = 0; 2575 c->i[0] = 0; 2576 for (k = 0; k < c->compressedrow.nrows; k++) { 2577 const PetscInt next = c->compressedrow.rindex[k]; 2578 const PetscInt old = c->compressedrow.i[k]; 2579 for (; r < next; r++) c->i[r+1] = old; 2580 } 2581 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2582 } 2583 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2584 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2585 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2586 c->maxnz = c->nz; 2587 c->nonzerorowcnt = 0; 2588 c->rmax = 0; 2589 for (k = 0; k < m; k++) { 2590 const PetscInt nn = c->i[k+1] - c->i[k]; 2591 c->ilen[k] = c->imax[k] = nn; 2592 c->nonzerorowcnt += (PetscInt)!!nn; 2593 c->rmax = PetscMax(c->rmax,nn); 2594 } 2595 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2596 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2597 Ccsr->num_entries = c->nz; 2598 2599 C->nonzerostate++; 2600 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2601 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2602 Ccusp->nonzerostate = C->nonzerostate; 2603 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2604 C->preallocated = PETSC_TRUE; 2605 C->assembled = PETSC_FALSE; 2606 C->was_assembled = PETSC_FALSE; 2607 if (product->api_user) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2608 mmdata->reusesym = PETSC_TRUE; 2609 C->offloadmask = PETSC_OFFLOAD_GPU; 2610 } 2611 
C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2612 PetscFunctionReturn(0); 2613 } 2614 2615 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2616 2617 /* handles sparse or dense B */ 2618 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2619 { 2620 Mat_Product *product = mat->product; 2621 PetscErrorCode ierr; 2622 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2623 2624 PetscFunctionBegin; 2625 MatCheckProduct(mat,1); 2626 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2627 if (!product->B->boundtocpu) { 2628 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2629 } 2630 if (product->type == MATPRODUCT_ABC) { 2631 Ciscusp = PETSC_FALSE; 2632 if (!product->C->boundtocpu) { 2633 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2634 } 2635 } 2636 if (isdense) { 2637 switch (product->type) { 2638 case MATPRODUCT_AB: 2639 case MATPRODUCT_AtB: 2640 case MATPRODUCT_ABt: 2641 case MATPRODUCT_PtAP: 2642 case MATPRODUCT_RARt: 2643 if (product->A->boundtocpu) { 2644 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2645 } else { 2646 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2647 } 2648 break; 2649 case MATPRODUCT_ABC: 2650 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2651 break; 2652 default: 2653 break; 2654 } 2655 } else if (Biscusp && Ciscusp) { 2656 switch (product->type) { 2657 case MATPRODUCT_AB: 2658 case MATPRODUCT_AtB: 2659 case MATPRODUCT_ABt: 2660 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2661 break; 2662 case MATPRODUCT_PtAP: 2663 case MATPRODUCT_RARt: 2664 case MATPRODUCT_ABC: 2665 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2666 break; 2667 default: 2668 break; 2669 } 2670 } else { /* fallback for AIJ */ 2671 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2672 } 2673 PetscFunctionReturn(0); 2674 } 2675 2676 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2677 { 2678 PetscErrorCode ierr; 2679 2680 PetscFunctionBegin; 2681 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2682 PetscFunctionReturn(0); 2683 } 2684 2685 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2686 { 2687 PetscErrorCode ierr; 2688 2689 PetscFunctionBegin; 2690 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2691 PetscFunctionReturn(0); 2692 } 2693 2694 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2695 { 2696 PetscErrorCode ierr; 2697 2698 PetscFunctionBegin; 2699 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2700 PetscFunctionReturn(0); 2701 } 2702 2703 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2704 { 2705 PetscErrorCode ierr; 2706 2707 PetscFunctionBegin; 2708 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2709 PetscFunctionReturn(0); 2710 } 2711 2712 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2713 { 2714 PetscErrorCode ierr; 2715 2716 PetscFunctionBegin; 2717 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2718 PetscFunctionReturn(0); 2719 } 2720 2721 /* z = op(A) x + y. 
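   (yy may be NULL, in which case beta_zero is selected below and z = op(A) x.)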
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2722 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2723 { 2724 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2725 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2726 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2727 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2728 PetscErrorCode ierr; 2729 cudaError_t cerr; 2730 cusparseStatus_t stat; 2731 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2732 PetscBool compressed; 2733 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2734 PetscInt nx,ny; 2735 #endif 2736 2737 PetscFunctionBegin; 2738 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian without transpose is not supported"); 2739 if (!a->nonzerorowcnt) { 2740 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2741 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2742 PetscFunctionReturn(0); 2743 } 2744 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2745 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2746 if (!trans) { 2747 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2748 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2749 } else { 2750 if (herm || !cusparsestruct->transgen) { 2751 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2752 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2753 } else { 2754 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2755 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2756 } 2757 } 2758 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2759 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2760 2761 try { 2762 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2763 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so we need an up-to-date zarray on the GPU */ 2764 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2765 2766 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2767 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2768 /* z = A x + beta y. 2769 If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2770 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2771 */ 2772 xptr = xarray; 2773 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2774 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2775 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2776 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2777 allocated to accommodate different uses. So we get the length info directly from mat. 2778 */ 2779 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2780 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2781 nx = mat->num_cols; 2782 ny = mat->num_rows; 2783 } 2784 #endif 2785 } else { 2786 /* z = A^T x + beta y 2787 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2788 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2789 */ 2790 xptr = compressed ?
cusparsestruct->workVector->data().get() : xarray; 2791 dptr = zarray; 2792 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2793 if (compressed) { /* Scatter x to work vector */ 2794 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2795 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2796 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2797 VecCUDAEqualsReverse()); 2798 } 2799 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2800 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2801 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2802 nx = mat->num_rows; 2803 ny = mat->num_cols; 2804 } 2805 #endif 2806 } 2807 2808 /* csr_spmv does y = alpha op(A) x + beta y */ 2809 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2810 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2811 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2812 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2813 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2814 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2815 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2816 matstruct->matDescr, 2817 matstruct->cuSpMV[opA].vecXDescr, beta, 2818 matstruct->cuSpMV[opA].vecYDescr, 2819 cusparse_scalartype, 2820 cusparsestruct->spmvAlg, 2821 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2822 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2823 2824 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2825 } else { 2826 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2827 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2828 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2829 } 2830 2831 stat = cusparseSpMV(cusparsestruct->handle, opA, 2832 matstruct->alpha_one, 2833 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2834 matstruct->cuSpMV[opA].vecXDescr, 2835 beta, 2836 matstruct->cuSpMV[opA].vecYDescr, 2837 cusparse_scalartype, 2838 cusparsestruct->spmvAlg, 2839 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2840 #else 2841 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2842 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2843 mat->num_rows, mat->num_cols, 2844 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2845 mat->values->data().get(), mat->row_offsets->data().get(), 2846 mat->column_indices->data().get(), xptr, beta, 2847 dptr);CHKERRCUSPARSE(stat); 2848 #endif 2849 } else { 2850 if (cusparsestruct->nrows) { 2851 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2852 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2853 #else 2854 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2855 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2856 matstruct->alpha_one, matstruct->descr, hybMat, 2857 xptr, beta, 
2858 dptr);CHKERRCUSPARSE(stat); 2859 #endif 2860 } 2861 } 2862 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2863 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2864 2865 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2866 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2867 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2868 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2869 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2870 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2871 } 2872 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2873 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2874 } 2875 2876 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2877 if (compressed) { 2878 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2879 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2880 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2881 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2882 VecCUDAPlusEquals()); 2883 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2884 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2885 } 2886 } else { 2887 if (yy && yy != zz) { 2888 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2889 } 2890 } 2891 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2892 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2893 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2894 } catch(char *ex) { 2895 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2896 } 2897 if (yy) { 2898 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2899 } else { 2900 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2901 } 2902 PetscFunctionReturn(0); 2903 } 2904 2905 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2906 { 2907 PetscErrorCode ierr; 2908 2909 PetscFunctionBegin; 2910 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2911 PetscFunctionReturn(0); 2912 } 2913 2914 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2915 { 2916 PetscErrorCode ierr; 2917 PetscSplitCSRDataStructure *d_mat = NULL; 2918 PetscFunctionBegin; 2919 if (A->factortype == MAT_FACTOR_NONE) { 2920 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2921 } 2922 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2923 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2924 if (d_mat) { 2925 A->offloadmask = PETSC_OFFLOAD_GPU; 2926 } 2927 2928 PetscFunctionReturn(0); 2929 } 2930 2931 /* --------------------------------------------------------------------------------*/ 2932 /*@ 2933 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2934 (the default sequential PETSc format). This matrix will ultimately be pushed down 2935 to NVidia GPUs and will use the CUSPARSE library for calculations.
For good matrix 2936 assembly performance the user should preallocate the matrix storage by setting 2937 the parameter nz (or the array nnz). By setting these parameters accurately, 2938 performance during matrix assembly can be increased by more than a factor of 50. 2939 2940 Collective 2941 2942 Input Parameters: 2943 + comm - MPI communicator, set to PETSC_COMM_SELF 2944 . m - number of rows 2945 . n - number of columns 2946 . nz - number of nonzeros per row (same for all rows) 2947 - nnz - array containing the number of nonzeros in the various rows 2948 (possibly different for each row) or NULL 2949 2950 Output Parameter: 2951 . A - the matrix 2952 2953 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2954 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2955 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2956 2957 Notes: 2958 If nnz is given then nz is ignored 2959 2960 The AIJ format (also called the Yale sparse matrix format or 2961 compressed row storage), is fully compatible with standard Fortran 77 2962 storage. That is, the stored row and column indices can begin at 2963 either one (as in Fortran) or zero. See the users' manual for details. 2964 2965 Specify the preallocated storage with either nz or nnz (not both). 2966 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 2967 allocation. For large problems you MUST preallocate memory or you 2968 will get TERRIBLE performance, see the users' manual chapter on matrices. 2969 2970 By default, this format uses inodes (identical nodes) when possible, to 2971 improve numerical efficiency of matrix-vector products and solves. We 2972 search for consecutive rows with the same nonzero structure, thereby 2973 reusing matrix information to achieve increased efficiency.
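   Example usage (a minimal sketch; the sizes and the per-row nonzero estimate are illustrative only):
.vb
   Mat A;
   ierr = MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,100,100,5,NULL,&A);CHKERRQ(ierr);
   /* fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() */
   ierr = MatDestroy(&A);CHKERRQ(ierr);
.ve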
2974 2975 Level: intermediate 2976 2977 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2978 @*/ 2979 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2980 { 2981 PetscErrorCode ierr; 2982 2983 PetscFunctionBegin; 2984 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2985 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2986 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2987 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2988 PetscFunctionReturn(0); 2989 } 2990 2991 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 2992 { 2993 PetscErrorCode ierr; 2994 PetscSplitCSRDataStructure *d_mat = NULL; 2995 2996 PetscFunctionBegin; 2997 if (A->factortype == MAT_FACTOR_NONE) { 2998 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2999 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3000 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3001 } else { 3002 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3003 } 3004 if (d_mat) { 3005 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3006 cudaError_t err; 3007 PetscSplitCSRDataStructure h_mat; 3008 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3009 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3010 if (a->compressedrow.use) { 3011 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3012 } 3013 err = cudaFree(d_mat);CHKERRCUDA(err); 3014 } 3015 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3016 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3017 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3018 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3019 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3020 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3021 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3022 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3023 PetscFunctionReturn(0); 3024 } 3025 3026 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3027 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3028 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3029 { 3030 PetscErrorCode ierr; 3031 3032 PetscFunctionBegin; 3033 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3034 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3035 PetscFunctionReturn(0); 3036 } 3037 3038 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
3039 { 3040 PetscErrorCode ierr; 3041 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3042 PetscBool flgx,flgy; 3043 3044 PetscFunctionBegin; 3045 if (a == (PetscScalar)0.0) PetscFunctionReturn(0); 3046 PetscValidHeaderSpecific(Y,MAT_CLASSID,1); 3047 PetscValidHeaderSpecific(X,MAT_CLASSID,3); 3048 ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr); 3049 ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr); 3050 if (!flgx || !flgy) { 3051 ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 3052 PetscFunctionReturn(0); 3053 } 3054 if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE"); 3055 if (str == DIFFERENT_NONZERO_PATTERN) { 3056 if (x->nz == y->nz) { 3057 PetscBool e; 3058 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 3059 if (e) { 3060 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 3061 if (e) { 3062 str = SAME_NONZERO_PATTERN; 3063 } 3064 } 3065 } 3066 } 3067 if (str != SAME_NONZERO_PATTERN) { 3068 ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 3069 PetscFunctionReturn(0); 3070 } else { 3071 Mat_SeqAIJCUSPARSE *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3072 Mat_SeqAIJCUSPARSE *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr; 3073 if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3074 if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3075 if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) { 3076 if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) { 3077 ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr); 3078 } 3079 if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) { 3080 ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr); 3081 } 3082 ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3083 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3084 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3085 } else { 3086 cublasHandle_t cublasv2handle; 3087 cublasStatus_t cberr; 3088 cudaError_t err; 3089 PetscScalar alpha = a; 3090 PetscBLASInt one = 1, bnz = 1; 3091 CsrMatrix *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat; 3092 CsrMatrix *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat; 3093 PetscScalar *aa_y, *aa_x; 3094 aa_y = matrix_y->values->data().get(); 3095 aa_x = matrix_x->values->data().get(); 3096 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3097 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3098 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3099 cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr); 3100 err = WaitForCUDA();CHKERRCUDA(err); 3101 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3102 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3103 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3104 ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr); 3105 if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU; 3106 else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state"); 3107 ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr); 3108 } 3109 } 3110 PetscFunctionReturn(0); 3111 } 3112 3113 static PetscErrorCode 
MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3114 { 3115 PetscErrorCode ierr; 3116 PetscBool both = PETSC_FALSE; 3117 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3118 3119 PetscFunctionBegin; 3120 if (A->factortype == MAT_FACTOR_NONE) { 3121 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3122 if (spptr->mat) { 3123 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3124 if (matrix->values) { 3125 both = PETSC_TRUE; 3126 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3127 } 3128 } 3129 if (spptr->matTranspose) { 3130 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3131 if (matrix->values) { 3132 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3133 } 3134 } 3135 } 3136 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3137 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3138 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3139 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3140 else A->offloadmask = PETSC_OFFLOAD_CPU; 3141 3142 PetscFunctionReturn(0); 3143 } 3144 3145 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3146 { 3147 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3148 PetscErrorCode ierr; 3149 3150 PetscFunctionBegin; 3151 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3152 if (flg) { 3153 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3154 3155 A->ops->axpy = MatAXPY_SeqAIJ; 3156 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3157 A->ops->mult = MatMult_SeqAIJ; 3158 A->ops->multadd = MatMultAdd_SeqAIJ; 3159 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3160 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3161 A->ops->multhermitiantranspose = NULL; 3162 A->ops->multhermitiantransposeadd = NULL; 3163 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3164 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3165 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3166 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3167 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3168 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3169 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3170 } else { 3171 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3172 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3173 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3174 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3175 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3176 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3177 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3178 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3179 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3180 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3181 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3182 ierr = 
PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3183 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3184 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3185 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3186 } 3187 A->boundtocpu = flg; 3188 a->inode.use = flg; 3189 PetscFunctionReturn(0); 3190 } 3191 3192 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3193 { 3194 PetscErrorCode ierr; 3195 cusparseStatus_t stat; 3196 Mat B; 3197 3198 PetscFunctionBegin; 3199 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3200 if (reuse == MAT_INITIAL_MATRIX) { 3201 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3202 } else if (reuse == MAT_REUSE_MATRIX) { 3203 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3204 } 3205 B = *newmat; 3206 3207 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3208 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3209 3210 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3211 if (B->factortype == MAT_FACTOR_NONE) { 3212 Mat_SeqAIJCUSPARSE *spptr; 3213 3214 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3215 spptr->format = MAT_CUSPARSE_CSR; 3216 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3217 B->spptr = spptr; 3218 spptr->deviceMat = NULL; 3219 } else { 3220 Mat_SeqAIJCUSPARSETriFactors *spptr; 3221 3222 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3223 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3224 B->spptr = spptr; 3225 } 3226 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3227 } 3228 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3229 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3230 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3231 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3232 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3233 3234 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3235 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3236 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3237 PetscFunctionReturn(0); 3238 } 3239 3240 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3241 { 3242 PetscErrorCode ierr; 3243 3244 PetscFunctionBegin; 3245 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3246 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3247 ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 3248 ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 3249 ierr = PetscOptionsEnd();CHKERRQ(ierr); 3250 PetscFunctionReturn(0); 3251 } 3252 3253 /*MC 3254 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3255 3256 A matrix type whose data resides on Nvidia GPUs. These matrices can be in either 3257 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3258 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
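   For example, an existing MATSEQAIJ code that calls MatSetFromOptions() can select this type and its storage format at run time (a sketch; the executable name is hypothetical):
.vb
   ./myapp -mat_type aijcusparse -mat_cusparse_storage_format csr
.ve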
3259 3260 Options Database Keys: 3261 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3262 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3263 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3264 3265 Level: beginner 3266 3267 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3268 M*/ 3269 3270 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 3271 3272 3273 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3274 { 3275 PetscErrorCode ierr; 3276 3277 PetscFunctionBegin; 3278 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3279 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3280 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3281 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3282 PetscFunctionReturn(0); 3283 } 3284 3285 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3286 { 3287 PetscErrorCode ierr; 3288 cusparseStatus_t stat; 3289 3290 PetscFunctionBegin; 3291 if (*cusparsestruct) { 3292 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3293 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3294 delete (*cusparsestruct)->workVector; 3295 delete (*cusparsestruct)->rowoffsets_gpu; 3296 delete (*cusparsestruct)->cooPerm; 3297 delete (*cusparsestruct)->cooPerm_a; 3298 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3299 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3300 cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 3301 #endif 3302 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3303 } 3304 PetscFunctionReturn(0); 3305 } 3306 3307 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3308 { 3309 PetscFunctionBegin; 3310 if (*mat) { 3311 delete (*mat)->values; 3312 delete (*mat)->column_indices; 3313 delete (*mat)->row_offsets; 3314 delete *mat; 3315 *mat = 0; 3316 } 3317 PetscFunctionReturn(0); 3318 } 3319 3320 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3321 { 3322 cusparseStatus_t stat; 3323 PetscErrorCode ierr; 3324 3325 PetscFunctionBegin; 3326 if (*trifactor) { 3327 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3328 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3329 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3330 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3331 if ((*trifactor)->AA_h) {cudaError_t cerr = 
cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3332 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3333 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3334 #endif 3335 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3336 } 3337 PetscFunctionReturn(0); 3338 } 3339 3340 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3341 { 3342 CsrMatrix *mat; 3343 cusparseStatus_t stat; 3344 cudaError_t err; 3345 3346 PetscFunctionBegin; 3347 if (*matstruct) { 3348 if ((*matstruct)->mat) { 3349 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3350 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3351 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3352 #else 3353 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3354 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3355 #endif 3356 } else { 3357 mat = (CsrMatrix*)(*matstruct)->mat; 3358 CsrMatrix_Destroy(&mat); 3359 } 3360 } 3361 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3362 delete (*matstruct)->cprowIndices; 3363 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3364 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3365 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3366 3367 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3368 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3369 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3370 for (int i=0; i<3; i++) { 3371 if (mdata->cuSpMV[i].initialized) { 3372 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3373 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3374 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3375 } 3376 } 3377 #endif 3378 delete *matstruct; 3379 *matstruct = NULL; 3380 } 3381 PetscFunctionReturn(0); 3382 } 3383 3384 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3385 { 3386 PetscErrorCode ierr; 3387 3388 PetscFunctionBegin; 3389 if (*trifactors) { 3390 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3391 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3392 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3393 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3394 delete (*trifactors)->rpermIndices; 3395 delete (*trifactors)->cpermIndices; 3396 delete (*trifactors)->workVector; 3397 (*trifactors)->rpermIndices = NULL; 3398 (*trifactors)->cpermIndices = NULL; 3399 (*trifactors)->workVector = NULL; 3400 } 3401 PetscFunctionReturn(0); 3402 } 3403 3404 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3405 { 3406 PetscErrorCode ierr; 3407 cusparseHandle_t handle; 3408 cusparseStatus_t stat; 3409 3410 PetscFunctionBegin; 3411 if (*trifactors) { 3412 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3413 if ((handle = (*trifactors)->handle)) { 3414 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3415 } 3416 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3417 } 3418 PetscFunctionReturn(0); 3419 }
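/* Thrust functors used by the COO assembly routines below: IJCompare orders (i,j) pairs
   lexicographically (for sort_by_key), IJEqual flags duplicate (i,j) entries (for unique),
   and IJDiff/IJSum are combined with adjacent_difference and inclusive_scan in
   MatSetPreallocationCOO_SeqAIJCUSPARSE to map each sorted COO entry to its unique nonzero
   slot (cooPerm_a), which reduce_by_key then uses in MatSetValuesCOO_SeqAIJCUSPARSE to sum
   duplicates. As an illustration (example data, not from the original source): for sorted
   entries (0,0),(0,0),(0,1),(1,1), IJDiff over the rows gives 0,0,0,1 and over the columns
   gives 0,0,1,0; IJSum merges them into 0,0,1,1 and the inclusive scan yields slots 0,0,1,2,
   so the two duplicates of (0,0) both map to slot 0. */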
3420 3421 struct IJCompare 3422 { 3423 __host__ __device__ 3424 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3425 { 3426 if (t1.get<0>() < t2.get<0>()) return true; 3427 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3428 return false; 3429 } 3430 }; 3431 3432 struct IJEqual 3433 { 3434 __host__ __device__ 3435 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3436 { 3437 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3438 return true; 3439 } 3440 }; 3441 3442 struct IJDiff 3443 { 3444 __host__ __device__ 3445 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3446 { 3447 return t1 == t2 ? 0 : 1; 3448 } 3449 }; 3450 3451 struct IJSum 3452 { 3453 __host__ __device__ 3454 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3455 { 3456 return t1||t2; 3457 } 3458 }; 3459 3460 #include <thrust/iterator/discard_iterator.h> 3461 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3462 { 3463 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3464 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3465 THRUSTARRAY *cooPerm_v = NULL,*cooPerm_w; 3466 thrust::device_ptr<const PetscScalar> d_v; 3467 CsrMatrix *matrix; 3468 PetscErrorCode ierr; 3469 cudaError_t cerr; 3470 PetscInt n; 3471 3472 PetscFunctionBegin; 3473 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3474 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3475 if (!cusp->cooPerm) { 3476 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3477 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3478 PetscFunctionReturn(0); 3479 } 3480 matrix = (CsrMatrix*)cusp->mat->mat; 3481 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3482 ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 3483 if (!v) { 3484 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3485 goto finalize; 3486 } 3487 n = cusp->cooPerm->size(); 3488 if (isCudaMem(v)) { 3489 d_v = thrust::device_pointer_cast(v); 3490 } else { 3491 cooPerm_v = new THRUSTARRAY(n); 3492 cooPerm_v->assign(v,v+n); 3493 d_v = cooPerm_v->data(); 3494 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3495 } 3496 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3497 if (cusp->cooPerm_a) { 3498 cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3499 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3500 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3501 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3502 delete cooPerm_w; 3503 } else { 3504 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3505 matrix->values->begin())); 3506 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3507 matrix->values->end())); 3508 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3509 } 3510 } else { 3511 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> 
reduce */ 3512 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3513 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3514 } else { 3515 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3516 matrix->values->begin())); 3517 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3518 matrix->values->end())); 3519 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3520 } 3521 } 3522 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3523 ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 3524 finalize: 3525 delete cooPerm_v; 3526 A->offloadmask = PETSC_OFFLOAD_GPU; 3527 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3528 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3529 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3530 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3531 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3532 a->reallocs = 0; 3533 A->info.mallocs += 0; 3534 A->info.nz_unneeded = 0; 3535 A->assembled = A->was_assembled = PETSC_TRUE; 3536 A->num_ass++; 3537 PetscFunctionReturn(0); 3538 } 3539 3540 #include <thrust/binary_search.h> 3541 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3542 { 3543 PetscErrorCode ierr; 3544 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3545 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3546 PetscInt cooPerm_n, nzr = 0; 3547 cudaError_t cerr; 3548 3549 PetscFunctionBegin; 3550 ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 3551 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3552 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3553 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3554 if (n != cooPerm_n) { 3555 delete cusp->cooPerm; 3556 delete cusp->cooPerm_a; 3557 cusp->cooPerm = NULL; 3558 cusp->cooPerm_a = NULL; 3559 } 3560 if (n) { 3561 THRUSTINTARRAY d_i(n); 3562 THRUSTINTARRAY d_j(n); 3563 THRUSTINTARRAY ii(A->rmap->n); 3564 3565 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3566 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3567 3568 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3569 d_i.assign(coo_i,coo_i+n); 3570 d_j.assign(coo_j,coo_j+n); 3571 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3572 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3573 3574 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3575 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3576 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3577 *cusp->cooPerm_a = d_i; 3578 THRUSTINTARRAY w = d_j; 3579 3580 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3581 if (nekey == ekey) { /* all entries are unique */ 3582 delete cusp->cooPerm_a; 3583 cusp->cooPerm_a = NULL; 3584 } else { /* I couldn't come up with a more elegant algorithm */ 3585 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3586 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3587 (*cusp->cooPerm_a)[0] = 0; 3588 w[0] = 0; 3589 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3590 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3591 } 3592 thrust::counting_iterator<PetscInt> search_begin(0); 3593 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3594 search_begin, search_begin + A->rmap->n, 3595 ii.begin()); 3596 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3597 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3598 3599 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3600 a->singlemalloc = PETSC_FALSE; 3601 a->free_a = PETSC_TRUE; 3602 a->free_ij = PETSC_TRUE; 3603 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3604 a->i[0] = 0; 3605 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3606 a->nz = a->maxnz = a->i[A->rmap->n]; 3607 a->rmax = 0; 3608 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3609 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3610 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3611 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3612 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3613 for (PetscInt i = 0; i < A->rmap->n; i++) { 3614 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3615 nzr += (PetscInt)!!(nnzr); 3616 a->ilen[i] = a->imax[i] = nnzr; 3617 a->rmax = PetscMax(a->rmax,nnzr); 3618 } 3619 a->nonzerorowcnt = nzr; 3620 A->preallocated = PETSC_TRUE; 3621 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3622 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3623 } else { 3624 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3625 } 3626 ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 3627 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3628 3629 /* We want to allocate the CUSPARSE struct for matvec now. 
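Ideally we would only build the descriptors and the work vector here, without touching the values at all.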
3630 The code is so convoluted now that I prefer to copy zeros */ 3631 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3632 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3633 A->offloadmask = PETSC_OFFLOAD_CPU; 3634 A->nonzerostate++; 3635 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3636 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3637 3638 A->assembled = PETSC_FALSE; 3639 A->was_assembled = PETSC_FALSE; 3640 PetscFunctionReturn(0); 3641 } 3642 3643 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3644 { 3645 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3646 CsrMatrix *csr; 3647 PetscErrorCode ierr; 3648 3649 PetscFunctionBegin; 3650 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3651 PetscValidPointer(a,2); 3652 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3653 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3654 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3655 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3656 csr = (CsrMatrix*)cusp->mat->mat; 3657 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3658 *a = csr->values->data().get(); 3659 PetscFunctionReturn(0); 3660 } 3661 3662 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3663 { 3664 PetscFunctionBegin; 3665 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3666 PetscValidPointer(a,2); 3667 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3668 *a = NULL; 3669 PetscFunctionReturn(0); 3670 } 3671 3672 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3673 { 3674 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3675 CsrMatrix *csr; 3676 3677 PetscFunctionBegin; 3678 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3679 PetscValidPointer(a,2); 3680 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3681 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3682 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3683 csr = (CsrMatrix*)cusp->mat->mat; 3684 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3685 *a = csr->values->data().get(); 3686 PetscFunctionReturn(0); 3687 } 3688 3689 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3690 { 3691 PetscErrorCode ierr; 3692 3693 PetscFunctionBegin; 3694 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3695 PetscValidPointer(a,2); 3696 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3697 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3698 A->offloadmask = PETSC_OFFLOAD_GPU; 3699 *a = NULL; 3700 PetscFunctionReturn(0); 3701 } 3702 3703 struct IJCompare4 3704 { 3705 __host__ __device__ 3706 inline bool operator() (const thrust::tuple<int, int, PetscScalar, bool> &t1, const thrust::tuple<int, int, PetscScalar, bool> &t2) 3707 { 3708 if (t1.get<0>() < t2.get<0>()) return true; 3709 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3710 return false; 3711 } 3712 }; 3713 3714 struct Shift 3715 { 3716 int _shift; 3717 3718 Shift(int shift) : _shift(shift) {} 3719 __host__ __device__ 3720 inline int operator() (const int &c) 3721 { 3722 return c + _shift; 3723 } 3724 }; 3725 3726 /* merges two SeqAIJCUSPARSE matrices, the [A';B']' operation in MATLAB notation */ 3727 PetscErrorCode
MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3728 { 3729 PetscErrorCode ierr; 3730 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3731 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3732 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3733 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3734 PetscInt Annz,Bnnz; 3735 cusparseStatus_t stat; 3736 PetscInt i,m,n,zero = 0; 3737 cudaError_t cerr; 3738 3739 PetscFunctionBegin; 3740 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3741 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3742 PetscValidPointer(C,4); 3743 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3744 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3745 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n); 3746 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3747 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3748 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3749 if (reuse == MAT_INITIAL_MATRIX) { 3750 m = A->rmap->n; 3751 n = A->cmap->n + B->cmap->n; 3752 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3753 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3754 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3755 c = (Mat_SeqAIJ*)(*C)->data; 3756 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3757 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3758 Ccsr = new CsrMatrix; 3759 Cmat->cprowIndices = NULL; 3760 c->compressedrow.use = PETSC_FALSE; 3761 c->compressedrow.nrows = 0; 3762 c->compressedrow.i = NULL; 3763 c->compressedrow.rindex = NULL; 3764 Ccusp->workVector = NULL; 3765 Ccusp->nrows = m; 3766 Ccusp->mat = Cmat; 3767 Ccusp->mat->mat = Ccsr; 3768 Ccsr->num_rows = m; 3769 Ccsr->num_cols = n; 3770 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3771 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3772 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3773 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3774 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3775 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3776 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3777 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3778 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3779 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3780 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 3781 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 3782 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 3783 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3784 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3785 3786 Acsr = (CsrMatrix*)Acusp->mat->mat; 3787 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3788 Annz = (PetscInt)Acsr->column_indices->size(); 3789 Bnnz = (PetscInt)Bcsr->column_indices->size(); 3790 c->nz = Annz +
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);

    if (c->nz) {
      THRUSTINTARRAY32 Acoo(Annz);
      THRUSTINTARRAY32 Bcoo(Bnnz);
      THRUSTINTARRAY32 *roff;
      if (a->compressedrow.use) { /* need full row offsets */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Acusp->rowoffsets_gpu;
      } else roff = Acsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              roff->data().get(),
                              Annz,
                              m,
                              Acoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      if (b->compressedrow.use) { /* need full row offsets */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Bcusp->rowoffsets_gpu;
      } else roff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              roff->data().get(),
                              Bnnz,
                              m,
                              Bcoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      THRUSTINTARRAY32 Ccoo(c->nz);
      auto Aperm = thrust::make_constant_iterator(true);
      auto Bperm = thrust::make_constant_iterator(false);
      auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
      thrust::device_vector<bool> wPerm(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm.begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      thrust::merge(Azb,Aze,Bzb,Bze,Czb,IJCompare4());
      thrust::partition_copy(thrust::make_counting_iterator(zero),thrust::make_counting_iterator(c->nz),wPerm.begin(),p1,p2,thrust::identity<bool>());
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo.data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
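      /* wPerm flags each merged entry with its origin (true = from A). partition_copy over the
         counting iterator turns those flags into the cooPerm permutation: for t < Annz, cooPerm[t]
         is the position in C's value array of A's t-th entry, and for t >= Annz it is the position
         of B's (t-Annz)-th entry. The MAT_REUSE_MATRIX path below scatters fresh values through
         this permutation so the merge never has to be recomputed. */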
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have their transposes, generate C's transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen     = PETSC_TRUE;
        CmatT->cprowIndices = NULL;
        CmatT->mat          = CcsrT;
        CcsrT->num_rows     = n;
        CcsrT->num_cols     = m;
        CcsrT->num_entries  = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* C = [A B] column-wise, so C^T = [A^T; B^T] stacked row-wise: concatenate the transpose
           row offsets (B^T's shifted by a->nz), column indices, and values */
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
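    /* Mirror the merged sparsity pattern on the host so that *C behaves like any other MATSEQAIJ
       matrix for CPU-side queries (row lengths, diagonal marking, etc.). Only the pattern is
       copied back; the values stay on the GPU and the matrix is marked PETSC_OFFLOAD_GPU at the
       end of this routine. */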
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32-to-64-bit conversion on the GPU, then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
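      /* MAT_REUSE_MATRIX: the sparsity pattern of C is unchanged, so only the values need
         updating. Scatter A's and then B's value arrays into C through the cooPerm permutation
         computed at MAT_INITIAL_MATRIX time, using permutation iterators and VecCUDAEquals()
         so the copy runs entirely on the device. */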
SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 3978 auto pmid = Ccusp->cooPerm->begin(); 3979 thrust::advance(pmid,Acsr->num_entries); 3980 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3981 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 3982 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 3983 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 3984 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 3985 thrust::for_each(zibait,zieait,VecCUDAEquals()); 3986 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 3987 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 3988 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 3989 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 3990 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 3991 if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) { 3992 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 3993 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 3994 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 3995 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 3996 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 3997 auto vT = CcsrT->values->begin(); 3998 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 3999 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4000 } 4001 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4002 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4003 } 4004 } 4005 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4006 (*C)->assembled = PETSC_TRUE; 4007 (*C)->was_assembled = PETSC_FALSE; 4008 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4009 PetscFunctionReturn(0); 4010 } 4011