1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <../src/mat/impls/sbaij/seq/sbaij.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15 #include <thrust/async/for_each.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21 22 typedef enum { 23 CUSPARSE_MV_ALG_DEFAULT = 0, 24 CUSPARSE_COOMV_ALG = 1, 25 CUSPARSE_CSRMV_ALG1 = 2, 26 CUSPARSE_CSRMV_ALG2 = 3 27 } cusparseSpMVAlg_t; 28 29 typedef enum { 30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35 CUSPARSE_SPMM_ALG_DEFAULT = 0, 36 CUSPARSE_SPMM_COO_ALG1 = 1, 37 CUSPARSE_SPMM_COO_ALG2 = 2, 38 CUSPARSE_SPMM_COO_ALG3 = 3, 39 CUSPARSE_SPMM_COO_ALG4 = 5, 40 CUSPARSE_SPMM_CSR_ALG1 = 4, 41 CUSPARSE_SPMM_CSR_ALG2 = 6, 42 } cusparseSpMMAlg_t; 43 44 typedef enum { 45 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47 } cusparseCsr2CscAlg_t; 48 */ 49 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52 #endif 53 54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57 58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61 62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 76 77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 82 83 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 86 87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 89 90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 91 92 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 93 { 94 cusparseStatus_t stat; 95 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 96 97 PetscFunctionBegin; 98 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 99 cusparsestruct->stream = stream; 100 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 101 PetscFunctionReturn(0); 102 } 103 104 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 105 { 106 cusparseStatus_t stat; 107 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 108 109 PetscFunctionBegin; 110 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 111 if (cusparsestruct->handle != handle) { 112 if (cusparsestruct->handle) { 113 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 114 } 115 cusparsestruct->handle = handle; 116 } 117 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 118 PetscFunctionReturn(0); 119 } 120 121 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 122 { 123 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 124 PetscBool flg; 125 PetscErrorCode ierr; 126 127 PetscFunctionBegin; 128 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 129 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 130 if (cusparsestruct->handle) cusparsestruct->handle = 0; 131 PetscFunctionReturn(0); 132 } 133 134 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 135 { 136 PetscFunctionBegin; 137 *type = MATSOLVERCUSPARSE; 138 PetscFunctionReturn(0); 139 } 140 141 /*MC 142 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 143 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 144 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 145 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 146 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 147 algorithms are not recommended. This class does NOT support direct solver operations. 148 149 Level: beginner 150 151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152 M*/ 153 154 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 155 { 156 PetscErrorCode ierr; 157 PetscInt n = A->rmap->n; 158 159 PetscFunctionBegin; 160 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 161 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 162 (*B)->factortype = ftype; 163 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 164 165 if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); } 166 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 167 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 168 if (!A->boundtocpu) { 169 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 170 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 171 } else { 172 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 173 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 174 } 175 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 176 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 177 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 178 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 179 if (!A->boundtocpu) { 180 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 181 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 182 } else { 183 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 184 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 185 } 186 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 187 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 188 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 189 190 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 191 (*B)->canuseordering = PETSC_TRUE; 192 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 193 PetscFunctionReturn(0); 194 } 195 196 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 197 { 198 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 199 200 PetscFunctionBegin; 201 switch (op) { 202 case MAT_CUSPARSE_MULT: 203 cusparsestruct->format = format; 204 break; 205 case MAT_CUSPARSE_ALL: 206 cusparsestruct->format = format; 207 break; 208 default: 209 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 210 } 211 PetscFunctionReturn(0); 212 } 213 214 /*@ 215 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 216 operation. Only the MatMult operation can use different GPU storage formats 217 for MPIAIJCUSPARSE matrices. 218 Not Collective 219 220 Input Parameters: 221 + A - Matrix of type SEQAIJCUSPARSE 222 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 223 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 224 225 Output Parameter: 226 227 Level: intermediate 228 229 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 230 @*/ 231 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 232 { 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 237 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 238 PetscFunctionReturn(0); 239 } 240 241 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 242 { 243 PetscErrorCode ierr; 244 245 PetscFunctionBegin; 246 switch (op) { 247 case MAT_FORM_EXPLICIT_TRANSPOSE: 248 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 249 if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 250 A->form_explicit_transpose = flg; 251 break; 252 default: 253 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 254 break; 255 } 256 PetscFunctionReturn(0); 257 } 258 259 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 260 261 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 262 { 263 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 264 IS isrow = b->row,iscol = b->col; 265 PetscBool row_identity,col_identity; 266 PetscErrorCode ierr; 267 268 PetscFunctionBegin; 269 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 270 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 271 B->offloadmask = PETSC_OFFLOAD_CPU; 272 /* determine which version of MatSolve needs to be used. */ 273 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 274 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 275 if (row_identity && col_identity) { 276 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 277 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 278 B->ops->matsolve = NULL; 279 B->ops->matsolvetranspose = NULL; 280 } else { 281 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 282 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 283 B->ops->matsolve = NULL; 284 B->ops->matsolvetranspose = NULL; 285 } 286 287 /* get the triangular factors */ 288 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 289 PetscFunctionReturn(0); 290 } 291 292 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 293 { 294 PetscErrorCode ierr; 295 MatCUSPARSEStorageFormat format; 296 PetscBool flg; 297 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 298 299 PetscFunctionBegin; 300 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 301 if (A->factortype == MAT_FACTOR_NONE) { 302 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 303 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 304 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 305 306 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 307 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 308 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 309 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 310 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 311 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 312 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 313 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 314 if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 315 #else 316 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 317 #endif 318 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 319 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 320 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 321 322 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 323 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 324 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 325 #endif 326 } 327 ierr = PetscOptionsTail();CHKERRQ(ierr); 328 PetscFunctionReturn(0); 329 } 330 331 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 332 { 333 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 334 PetscErrorCode ierr; 335 336 PetscFunctionBegin; 337 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 338 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 339 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 340 PetscFunctionReturn(0); 341 } 342 343 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 344 { 345 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 346 PetscErrorCode ierr; 347 348 PetscFunctionBegin; 349 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 350 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 351 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 352 PetscFunctionReturn(0); 353 } 354 355 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 356 { 357 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 358 PetscErrorCode ierr; 359 360 PetscFunctionBegin; 361 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 362 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 363 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 364 PetscFunctionReturn(0); 365 } 366 367 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 368 { 369 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 370 PetscErrorCode ierr; 371 372 PetscFunctionBegin; 373 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 374 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 375 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 376 PetscFunctionReturn(0); 377 } 378 379 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 380 { 381 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 382 PetscInt n = A->rmap->n; 383 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 384 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 385 cusparseStatus_t stat; 386 const PetscInt *ai = a->i,*aj = a->j,*vi; 387 const MatScalar *aa = a->a,*v; 388 PetscInt *AiLo, *AjLo; 389 PetscInt i,nz, nzLower, offset, rowOffset; 390 PetscErrorCode ierr; 391 cudaError_t cerr; 392 393 PetscFunctionBegin; 394 if (!n) PetscFunctionReturn(0); 395 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 396 try { 397 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 398 nzLower=n+ai[n]-ai[1]; 399 if (!loTriFactor) { 400 PetscScalar *AALo; 401 402 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 403 404 /* Allocate Space for the lower triangular matrix */ 405 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 406 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 407 408 /* Fill the lower triangular matrix */ 409 AiLo[0] = (PetscInt) 0; 410 AiLo[n] = nzLower; 411 AjLo[0] = (PetscInt) 0; 412 AALo[0] = (MatScalar) 1.0; 413 v = aa; 414 vi = aj; 415 offset = 1; 416 rowOffset= 1; 417 for (i=1; i<n; i++) { 418 nz = ai[i+1] - ai[i]; 419 /* additional 1 for the term on the diagonal */ 420 AiLo[i] = rowOffset; 421 rowOffset += nz+1; 422 423 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 424 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 425 426 offset += nz; 427 AjLo[offset] = (PetscInt) i; 428 AALo[offset] = (MatScalar) 1.0; 429 offset += 1; 430 431 v += nz; 432 vi += nz; 433 } 434 435 /* allocate space for the triangular factor information */ 436 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 437 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 438 /* Create the matrix description */ 439 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 440 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 441 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 442 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 443 #else 444 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 445 #endif 446 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 447 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 448 449 /* set the operation */ 450 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 451 452 /* set the matrix */ 453 loTriFactor->csrMat = new CsrMatrix; 454 loTriFactor->csrMat->num_rows = n; 455 loTriFactor->csrMat->num_cols = n; 456 loTriFactor->csrMat->num_entries = nzLower; 457 458 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 459 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 460 461 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 462 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 463 464 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 465 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 466 467 /* Create the solve analysis information */ 468 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 469 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 470 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 471 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 472 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 473 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 474 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 475 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 476 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 477 #endif 478 479 /* perform the solve analysis */ 480 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 481 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 482 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 483 loTriFactor->csrMat->column_indices->data().get(), 484 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 485 loTriFactor->solveInfo, 486 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 487 #else 488 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 489 #endif 490 cerr = WaitForCUDA();CHKERRCUDA(cerr); 491 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 492 493 /* assign the pointer */ 494 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 495 loTriFactor->AA_h = AALo; 496 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 497 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 498 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 499 } else { /* update values only */ 500 if (!loTriFactor->AA_h) { 501 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 502 } 503 /* Fill the lower triangular matrix */ 504 loTriFactor->AA_h[0] = 1.0; 505 v = aa; 506 vi = aj; 507 offset = 1; 508 for (i=1; i<n; i++) { 509 nz = ai[i+1] - ai[i]; 510 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 511 offset += nz; 512 loTriFactor->AA_h[offset] = 1.0; 513 offset += 1; 514 v += nz; 515 } 516 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 517 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 518 } 519 } catch(char *ex) { 520 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 521 } 522 } 523 PetscFunctionReturn(0); 524 } 525 526 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 527 { 528 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 529 PetscInt n = A->rmap->n; 530 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 531 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 532 cusparseStatus_t stat; 533 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 534 const MatScalar *aa = a->a,*v; 535 PetscInt *AiUp, *AjUp; 536 PetscInt i,nz, nzUpper, offset; 537 PetscErrorCode ierr; 538 cudaError_t cerr; 539 540 PetscFunctionBegin; 541 if (!n) PetscFunctionReturn(0); 542 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 543 try { 544 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 545 nzUpper = adiag[0]-adiag[n]; 546 if (!upTriFactor) { 547 PetscScalar *AAUp; 548 549 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 550 551 /* Allocate Space for the upper triangular matrix */ 552 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 553 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 554 555 /* Fill the upper triangular matrix */ 556 AiUp[0]=(PetscInt) 0; 557 AiUp[n]=nzUpper; 558 offset = nzUpper; 559 for (i=n-1; i>=0; i--) { 560 v = aa + adiag[i+1] + 1; 561 vi = aj + adiag[i+1] + 1; 562 563 /* number of elements NOT on the diagonal */ 564 nz = adiag[i] - adiag[i+1]-1; 565 566 /* decrement the offset */ 567 offset -= (nz+1); 568 569 /* first, set the diagonal elements */ 570 AjUp[offset] = (PetscInt) i; 571 AAUp[offset] = (MatScalar)1./v[nz]; 572 AiUp[i] = AiUp[i+1] - (nz+1); 573 574 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 575 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 576 } 577 578 /* allocate space for the triangular factor information */ 579 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 580 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 581 582 /* Create the matrix description */ 583 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 584 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 585 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 586 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 587 #else 588 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 589 #endif 590 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 591 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 592 593 /* set the operation */ 594 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 595 596 /* set the matrix */ 597 upTriFactor->csrMat = new CsrMatrix; 598 upTriFactor->csrMat->num_rows = n; 599 upTriFactor->csrMat->num_cols = n; 600 upTriFactor->csrMat->num_entries = nzUpper; 601 602 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 603 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 604 605 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 606 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 607 608 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 609 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 610 611 /* Create the solve analysis information */ 612 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 613 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 614 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 615 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 616 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 617 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 618 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 619 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 620 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 621 #endif 622 623 /* perform the solve analysis */ 624 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 625 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 626 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 627 upTriFactor->csrMat->column_indices->data().get(), 628 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 629 upTriFactor->solveInfo, 630 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 631 #else 632 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 633 #endif 634 cerr = WaitForCUDA();CHKERRCUDA(cerr); 635 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 636 637 /* assign the pointer */ 638 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 639 upTriFactor->AA_h = AAUp; 640 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 641 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 642 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 643 } else { 644 if (!upTriFactor->AA_h) { 645 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 646 } 647 /* Fill the upper triangular matrix */ 648 offset = nzUpper; 649 for (i=n-1; i>=0; i--) { 650 v = aa + adiag[i+1] + 1; 651 652 /* number of elements NOT on the diagonal */ 653 nz = adiag[i] - adiag[i+1]-1; 654 655 /* decrement the offset */ 656 offset -= (nz+1); 657 658 /* first, set the diagonal elements */ 659 upTriFactor->AA_h[offset] = 1./v[nz]; 660 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 661 } 662 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 663 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 664 } 665 } catch(char *ex) { 666 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 667 } 668 } 669 PetscFunctionReturn(0); 670 } 671 672 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 673 { 674 PetscErrorCode ierr; 675 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 676 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 677 IS isrow = a->row,iscol = a->icol; 678 PetscBool row_identity,col_identity; 679 PetscInt n = A->rmap->n; 680 681 PetscFunctionBegin; 682 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 683 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 684 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 685 686 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 687 cusparseTriFactors->nnz=a->nz; 688 689 A->offloadmask = PETSC_OFFLOAD_BOTH; 690 /* lower triangular indices */ 691 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 692 if (!row_identity && !cusparseTriFactors->rpermIndices) { 693 const PetscInt *r; 694 695 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 696 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 697 cusparseTriFactors->rpermIndices->assign(r, r+n); 698 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 699 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 700 } 701 702 /* upper triangular indices */ 703 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 704 if (!col_identity && !cusparseTriFactors->cpermIndices) { 705 const PetscInt *c; 706 707 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 708 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 709 cusparseTriFactors->cpermIndices->assign(c, c+n); 710 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 711 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 712 } 713 PetscFunctionReturn(0); 714 } 715 716 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 717 { 718 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 719 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 720 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 721 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 722 cusparseStatus_t stat; 723 PetscErrorCode ierr; 724 cudaError_t cerr; 725 PetscInt *AiUp, *AjUp; 726 PetscScalar *AAUp; 727 PetscScalar *AALo; 728 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 729 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 730 const PetscInt *ai = b->i,*aj = b->j,*vj; 731 const MatScalar *aa = b->a,*v; 732 733 PetscFunctionBegin; 734 if (!n) PetscFunctionReturn(0); 735 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 736 try { 737 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 738 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 739 if (!upTriFactor && !loTriFactor) { 740 /* Allocate Space for the upper triangular matrix */ 741 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 742 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 743 744 /* Fill the upper triangular matrix */ 745 AiUp[0]=(PetscInt) 0; 746 AiUp[n]=nzUpper; 747 offset = 0; 748 for (i=0; i<n; i++) { 749 /* set the pointers */ 750 v = aa + ai[i]; 751 vj = aj + ai[i]; 752 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 753 754 /* first, set the diagonal elements */ 755 AjUp[offset] = (PetscInt) i; 756 AAUp[offset] = (MatScalar)1.0/v[nz]; 757 AiUp[i] = offset; 758 AALo[offset] = (MatScalar)1.0/v[nz]; 759 760 offset+=1; 761 if (nz>0) { 762 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 763 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 764 for (j=offset; j<offset+nz; j++) { 765 AAUp[j] = -AAUp[j]; 766 AALo[j] = AAUp[j]/v[nz]; 767 } 768 offset+=nz; 769 } 770 } 771 772 /* allocate space for the triangular factor information */ 773 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 774 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 775 776 /* Create the matrix description */ 777 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 778 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 779 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 780 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 781 #else 782 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 783 #endif 784 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 785 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 786 787 /* set the matrix */ 788 upTriFactor->csrMat = new CsrMatrix; 789 upTriFactor->csrMat->num_rows = A->rmap->n; 790 upTriFactor->csrMat->num_cols = A->cmap->n; 791 upTriFactor->csrMat->num_entries = a->nz; 792 793 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 794 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 795 796 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 797 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 798 799 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 800 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 801 802 /* set the operation */ 803 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 804 805 /* Create the solve analysis information */ 806 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 807 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 808 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 809 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 810 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 811 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 812 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 813 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 814 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 815 #endif 816 817 /* perform the solve analysis */ 818 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 819 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 820 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 821 upTriFactor->csrMat->column_indices->data().get(), 822 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 823 upTriFactor->solveInfo, 824 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 825 #else 826 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 827 #endif 828 cerr = WaitForCUDA();CHKERRCUDA(cerr); 829 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 830 831 /* assign the pointer */ 832 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 833 834 /* allocate space for the triangular factor information */ 835 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 836 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 837 838 /* Create the matrix description */ 839 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 840 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 841 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 842 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 843 #else 844 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 845 #endif 846 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 847 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 848 849 /* set the operation */ 850 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 851 852 /* set the matrix */ 853 loTriFactor->csrMat = new CsrMatrix; 854 loTriFactor->csrMat->num_rows = A->rmap->n; 855 loTriFactor->csrMat->num_cols = A->cmap->n; 856 loTriFactor->csrMat->num_entries = a->nz; 857 858 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 859 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 860 861 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 862 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 863 864 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 865 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 866 867 /* Create the solve analysis information */ 868 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 869 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 870 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 871 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 872 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 873 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 874 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 875 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 876 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 877 #endif 878 879 /* perform the solve analysis */ 880 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 881 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 882 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 883 loTriFactor->csrMat->column_indices->data().get(), 884 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 885 loTriFactor->solveInfo, 886 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 887 #else 888 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 889 #endif 890 cerr = WaitForCUDA();CHKERRCUDA(cerr); 891 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 892 893 /* assign the pointer */ 894 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 895 896 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 897 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 898 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 899 } else { 900 /* Fill the upper triangular matrix */ 901 offset = 0; 902 for (i=0; i<n; i++) { 903 /* set the pointers */ 904 v = aa + ai[i]; 905 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 906 907 /* first, set the diagonal elements */ 908 AAUp[offset] = 1.0/v[nz]; 909 AALo[offset] = 1.0/v[nz]; 910 911 offset+=1; 912 if (nz>0) { 913 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 914 for (j=offset; j<offset+nz; j++) { 915 AAUp[j] = -AAUp[j]; 916 AALo[j] = AAUp[j]/v[nz]; 917 } 918 offset+=nz; 919 } 920 } 921 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 922 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 923 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 924 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 925 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 926 } 927 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 928 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 929 } catch(char *ex) { 930 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 931 } 932 } 933 PetscFunctionReturn(0); 934 } 935 936 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 937 { 938 PetscErrorCode ierr; 939 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 940 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 941 IS ip = a->row; 942 PetscBool perm_identity; 943 PetscInt n = A->rmap->n; 944 945 PetscFunctionBegin; 946 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 947 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 948 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 949 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 950 951 A->offloadmask = PETSC_OFFLOAD_BOTH; 952 953 /* lower triangular indices */ 954 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 955 if (!perm_identity) { 956 IS iip; 957 const PetscInt *irip,*rip; 958 959 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 960 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 961 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 962 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 963 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 964 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 965 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 966 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 967 ierr = ISDestroy(&iip);CHKERRQ(ierr); 968 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 969 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 970 } 971 PetscFunctionReturn(0); 972 } 973 974 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 975 { 976 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 977 IS ip = b->row; 978 PetscBool perm_identity; 979 PetscErrorCode ierr; 980 981 PetscFunctionBegin; 982 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 983 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 984 B->offloadmask = PETSC_OFFLOAD_CPU; 985 /* determine which version of MatSolve needs to be used. */ 986 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 987 if (perm_identity) { 988 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 989 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 990 B->ops->matsolve = NULL; 991 B->ops->matsolvetranspose = NULL; 992 } else { 993 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 994 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 995 B->ops->matsolve = NULL; 996 B->ops->matsolvetranspose = NULL; 997 } 998 999 /* get the triangular factors */ 1000 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 1001 PetscFunctionReturn(0); 1002 } 1003 1004 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1005 { 1006 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1007 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1008 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1009 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1010 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1011 cusparseStatus_t stat; 1012 cusparseIndexBase_t indexBase; 1013 cusparseMatrixType_t matrixType; 1014 cusparseFillMode_t fillMode; 1015 cusparseDiagType_t diagType; 1016 cudaError_t cerr; 1017 PetscErrorCode ierr; 1018 1019 PetscFunctionBegin; 1020 /* allocate space for the transpose of the lower triangular factor */ 1021 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1022 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1023 1024 /* set the matrix descriptors of the lower triangular factor */ 1025 matrixType = cusparseGetMatType(loTriFactor->descr); 1026 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1027 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1028 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1029 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1030 1031 /* Create the matrix description */ 1032 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1033 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1034 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1035 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1036 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1037 1038 /* set the operation */ 1039 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1040 1041 /* allocate GPU space for the CSC of the lower triangular factor*/ 1042 loTriFactorT->csrMat = new CsrMatrix; 1043 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1044 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1045 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1046 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1047 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1048 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1049 1050 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1051 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1052 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1053 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1054 loTriFactor->csrMat->values->data().get(), 1055 loTriFactor->csrMat->row_offsets->data().get(), 1056 loTriFactor->csrMat->column_indices->data().get(), 1057 loTriFactorT->csrMat->values->data().get(), 1058 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1059 CUSPARSE_ACTION_NUMERIC,indexBase, 1060 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1061 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1062 #endif 1063 1064 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1065 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1066 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1067 loTriFactor->csrMat->values->data().get(), 1068 loTriFactor->csrMat->row_offsets->data().get(), 1069 loTriFactor->csrMat->column_indices->data().get(), 1070 loTriFactorT->csrMat->values->data().get(), 1071 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1072 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1073 CUSPARSE_ACTION_NUMERIC, indexBase, 1074 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1075 #else 1076 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1077 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1078 #endif 1079 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1080 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1081 1082 /* Create the solve analysis information */ 1083 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1084 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1085 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1086 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1087 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1088 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1089 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1090 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1091 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1092 #endif 1093 1094 /* perform the solve analysis */ 1095 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1096 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1097 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1098 loTriFactorT->csrMat->column_indices->data().get(), 1099 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1100 loTriFactorT->solveInfo, 1101 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1102 #else 1103 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1104 #endif 1105 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1106 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1107 1108 /* assign the pointer */ 1109 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1110 1111 /*********************************************/ 1112 /* Now the Transpose of the Upper Tri Factor */ 1113 /*********************************************/ 1114 1115 /* allocate space for the transpose of the upper triangular factor */ 1116 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1117 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1118 1119 /* set the matrix descriptors of the upper triangular factor */ 1120 matrixType = cusparseGetMatType(upTriFactor->descr); 1121 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1122 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1123 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1124 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1125 1126 /* Create the matrix description */ 1127 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1128 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1129 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1130 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1131 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1132 1133 /* set the operation */ 1134 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1135 1136 /* allocate GPU space for the CSC of the upper triangular factor*/ 1137 upTriFactorT->csrMat = new CsrMatrix; 1138 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1139 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1140 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1141 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1142 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1143 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1144 1145 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1146 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1147 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1148 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1149 upTriFactor->csrMat->values->data().get(), 1150 upTriFactor->csrMat->row_offsets->data().get(), 1151 upTriFactor->csrMat->column_indices->data().get(), 1152 upTriFactorT->csrMat->values->data().get(), 1153 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1154 CUSPARSE_ACTION_NUMERIC,indexBase, 1155 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1156 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1157 #endif 1158 1159 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1160 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1161 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1162 upTriFactor->csrMat->values->data().get(), 1163 upTriFactor->csrMat->row_offsets->data().get(), 1164 upTriFactor->csrMat->column_indices->data().get(), 1165 upTriFactorT->csrMat->values->data().get(), 1166 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1167 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1168 CUSPARSE_ACTION_NUMERIC, indexBase, 1169 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1170 #else 1171 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1172 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1173 #endif 1174 1175 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1176 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1177 1178 /* Create the solve analysis information */ 1179 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1180 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1181 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1182 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1183 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1184 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1185 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1186 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1187 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1188 #endif 1189 1190 /* perform the solve analysis */ 1191 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1192 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1193 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1194 upTriFactorT->csrMat->column_indices->data().get(), 1195 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1196 upTriFactorT->solveInfo, 1197 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1198 #else 1199 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1200 #endif 1201 1202 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1203 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1204 1205 /* assign the pointer */ 1206 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1207 PetscFunctionReturn(0); 1208 } 1209 1210 struct PetscScalarToPetscInt 1211 { 1212 __host__ __device__ 1213 PetscInt operator()(PetscScalar s) 1214 { 1215 return (PetscInt)PetscRealPart(s); 1216 } 1217 }; 1218 1219 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1220 { 1221 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1222 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1223 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1224 cusparseStatus_t stat; 1225 cusparseIndexBase_t indexBase; 1226 cudaError_t err; 1227 PetscErrorCode ierr; 1228 1229 PetscFunctionBegin; 1230 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1231 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1232 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1233 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1234 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1235 if (A->transupdated) PetscFunctionReturn(0); 1236 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1237 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1238 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1239 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1240 } 1241 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1242 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1243 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1244 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1245 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1246 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1247 1248 /* set alpha and beta */ 1249 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1250 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1251 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1252 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1253 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1254 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1255 1256 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1257 CsrMatrix *matrixT = new CsrMatrix; 1258 matstructT->mat = matrixT; 1259 matrixT->num_rows = A->cmap->n; 1260 matrixT->num_cols = A->rmap->n; 1261 matrixT->num_entries = a->nz; 1262 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1263 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1264 matrixT->values = new THRUSTARRAY(a->nz); 1265 1266 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1267 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1268 1269 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1270 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1271 stat = cusparseCreateCsr(&matstructT->matDescr, 1272 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1273 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1274 matrixT->values->data().get(), 1275 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1276 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1277 #else 1278 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1279 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1280 1281 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1282 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1283 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1284 */ 1285 if (matrixT->num_entries) { 1286 stat = cusparseCreateCsr(&matstructT->matDescr, 1287 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1288 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1289 matrixT->values->data().get(), 1290 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1291 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1292 1293 } else { 1294 matstructT->matDescr = NULL; 1295 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1296 } 1297 #endif 1298 #endif 1299 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1300 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1301 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1302 #else 1303 CsrMatrix *temp = new CsrMatrix; 1304 CsrMatrix *tempT = new CsrMatrix; 1305 /* First convert HYB to CSR */ 1306 temp->num_rows = A->rmap->n; 1307 temp->num_cols = A->cmap->n; 1308 temp->num_entries = a->nz; 1309 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1310 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1311 temp->values = new THRUSTARRAY(a->nz); 1312 1313 stat = cusparse_hyb2csr(cusparsestruct->handle, 1314 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1315 temp->values->data().get(), 1316 temp->row_offsets->data().get(), 1317 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1318 1319 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1320 tempT->num_rows = A->rmap->n; 1321 tempT->num_cols = A->cmap->n; 1322 tempT->num_entries = a->nz; 1323 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1324 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1325 tempT->values = new THRUSTARRAY(a->nz); 1326 1327 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1328 temp->num_cols, temp->num_entries, 1329 temp->values->data().get(), 1330 temp->row_offsets->data().get(), 1331 temp->column_indices->data().get(), 1332 tempT->values->data().get(), 1333 tempT->column_indices->data().get(), 1334 tempT->row_offsets->data().get(), 1335 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1336 1337 /* Last, convert CSC to HYB */ 1338 cusparseHybMat_t hybMat; 1339 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1340 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1341 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1342 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1343 matstructT->descr, tempT->values->data().get(), 1344 tempT->row_offsets->data().get(), 1345 tempT->column_indices->data().get(), 1346 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1347 1348 /* assign the pointer */ 1349 matstructT->mat = hybMat; 1350 A->transupdated = PETSC_TRUE; 1351 /* delete temporaries */ 1352 if (tempT) { 1353 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1354 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1355 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1356 delete (CsrMatrix*) tempT; 1357 } 1358 if (temp) { 1359 if (temp->values) delete (THRUSTARRAY*) temp->values; 1360 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1361 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1362 delete (CsrMatrix*) temp; 1363 } 1364 #endif 1365 } 1366 } 1367 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1368 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1369 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1370 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1371 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1372 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1373 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1374 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1375 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1376 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1377 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1378 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1379 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1380 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1381 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1382 } 1383 if (!cusparsestruct->csr2csc_i) { 1384 THRUSTARRAY csr2csc_a(matrix->num_entries); 1385 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1386 1387 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1388 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1389 void *csr2cscBuffer; 1390 size_t csr2cscBufferSize; 1391 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1392 A->cmap->n, matrix->num_entries, 1393 matrix->values->data().get(), 1394 cusparsestruct->rowoffsets_gpu->data().get(), 1395 matrix->column_indices->data().get(), 1396 matrixT->values->data().get(), 1397 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1398 CUSPARSE_ACTION_NUMERIC,indexBase, 1399 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1400 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1401 #endif 1402 1403 if (matrix->num_entries) { 1404 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1405 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1406 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1407 1408 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1409 should be filled with indexBase. So I just take a shortcut here. 1410 */ 1411 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1412 A->cmap->n,matrix->num_entries, 1413 csr2csc_a.data().get(), 1414 cusparsestruct->rowoffsets_gpu->data().get(), 1415 matrix->column_indices->data().get(), 1416 matrixT->values->data().get(), 1417 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1418 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1419 CUSPARSE_ACTION_NUMERIC,indexBase, 1420 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1421 #else 1422 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1423 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1424 #endif 1425 } else { 1426 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1427 } 1428 1429 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1430 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1431 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1432 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1433 #endif 1434 } 1435 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1436 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1437 matrixT->values->begin())); 1438 } 1439 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1440 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1441 /* the compressed row indices is not used for matTranspose */ 1442 matstructT->cprowIndices = NULL; 1443 /* assign the pointer */ 1444 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1445 A->transupdated = PETSC_TRUE; 1446 PetscFunctionReturn(0); 1447 } 1448 1449 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1450 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1451 { 1452 PetscInt n = xx->map->n; 1453 const PetscScalar *barray; 1454 PetscScalar *xarray; 1455 thrust::device_ptr<const PetscScalar> bGPU; 1456 thrust::device_ptr<PetscScalar> xGPU; 1457 cusparseStatus_t stat; 1458 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1459 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1460 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1461 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1462 PetscErrorCode ierr; 1463 1464 PetscFunctionBegin; 1465 /* Analyze the matrix and create the transpose ... on the fly */ 1466 if (!loTriFactorT && !upTriFactorT) { 1467 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1468 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1469 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1470 } 1471 1472 /* Get the GPU pointers */ 1473 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1474 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1475 xGPU = thrust::device_pointer_cast(xarray); 1476 bGPU = thrust::device_pointer_cast(barray); 1477 1478 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1479 /* First, reorder with the row permutation */ 1480 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1481 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1482 xGPU); 1483 1484 /* First, solve U */ 1485 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1486 upTriFactorT->csrMat->num_rows, 1487 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1488 upTriFactorT->csrMat->num_entries, 1489 #endif 1490 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1491 upTriFactorT->csrMat->values->data().get(), 1492 upTriFactorT->csrMat->row_offsets->data().get(), 1493 upTriFactorT->csrMat->column_indices->data().get(), 1494 upTriFactorT->solveInfo, 1495 xarray, 1496 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1497 tempGPU->data().get(), 1498 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1499 #else 1500 tempGPU->data().get());CHKERRCUSPARSE(stat); 1501 #endif 1502 1503 /* Then, solve L */ 1504 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1505 loTriFactorT->csrMat->num_rows, 1506 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1507 loTriFactorT->csrMat->num_entries, 1508 #endif 1509 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1510 loTriFactorT->csrMat->values->data().get(), 1511 loTriFactorT->csrMat->row_offsets->data().get(), 1512 loTriFactorT->csrMat->column_indices->data().get(), 1513 loTriFactorT->solveInfo, 1514 tempGPU->data().get(), 1515 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1516 xarray, 1517 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1518 #else 1519 xarray);CHKERRCUSPARSE(stat); 1520 #endif 1521 1522 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1523 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1524 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1525 tempGPU->begin()); 1526 1527 /* Copy the temporary to the full solution. */ 1528 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1529 1530 /* restore */ 1531 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1532 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1533 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1534 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1535 PetscFunctionReturn(0); 1536 } 1537 1538 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1539 { 1540 const PetscScalar *barray; 1541 PetscScalar *xarray; 1542 cusparseStatus_t stat; 1543 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1544 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1545 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1546 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1547 PetscErrorCode ierr; 1548 1549 PetscFunctionBegin; 1550 /* Analyze the matrix and create the transpose ... on the fly */ 1551 if (!loTriFactorT && !upTriFactorT) { 1552 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1553 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1554 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1555 } 1556 1557 /* Get the GPU pointers */ 1558 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1559 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1560 1561 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1562 /* First, solve U */ 1563 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1564 upTriFactorT->csrMat->num_rows, 1565 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1566 upTriFactorT->csrMat->num_entries, 1567 #endif 1568 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1569 upTriFactorT->csrMat->values->data().get(), 1570 upTriFactorT->csrMat->row_offsets->data().get(), 1571 upTriFactorT->csrMat->column_indices->data().get(), 1572 upTriFactorT->solveInfo, 1573 barray, 1574 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1575 tempGPU->data().get(), 1576 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1577 #else 1578 tempGPU->data().get());CHKERRCUSPARSE(stat); 1579 #endif 1580 1581 /* Then, solve L */ 1582 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1583 loTriFactorT->csrMat->num_rows, 1584 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1585 loTriFactorT->csrMat->num_entries, 1586 #endif 1587 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1588 loTriFactorT->csrMat->values->data().get(), 1589 loTriFactorT->csrMat->row_offsets->data().get(), 1590 loTriFactorT->csrMat->column_indices->data().get(), 1591 loTriFactorT->solveInfo, 1592 tempGPU->data().get(), 1593 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1594 xarray, 1595 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1596 #else 1597 xarray);CHKERRCUSPARSE(stat); 1598 #endif 1599 1600 /* restore */ 1601 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1602 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1603 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1604 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1605 PetscFunctionReturn(0); 1606 } 1607 1608 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1609 { 1610 const PetscScalar *barray; 1611 PetscScalar *xarray; 1612 thrust::device_ptr<const PetscScalar> bGPU; 1613 thrust::device_ptr<PetscScalar> xGPU; 1614 cusparseStatus_t stat; 1615 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1616 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1617 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1618 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1619 PetscErrorCode ierr; 1620 1621 PetscFunctionBegin; 1622 1623 /* Get the GPU pointers */ 1624 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1625 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1626 xGPU = thrust::device_pointer_cast(xarray); 1627 bGPU = thrust::device_pointer_cast(barray); 1628 1629 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1630 /* First, reorder with the row permutation */ 1631 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1632 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1633 tempGPU->begin()); 1634 1635 /* Next, solve L */ 1636 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1637 loTriFactor->csrMat->num_rows, 1638 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1639 loTriFactor->csrMat->num_entries, 1640 #endif 1641 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1642 loTriFactor->csrMat->values->data().get(), 1643 loTriFactor->csrMat->row_offsets->data().get(), 1644 loTriFactor->csrMat->column_indices->data().get(), 1645 loTriFactor->solveInfo, 1646 tempGPU->data().get(), 1647 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1648 xarray, 1649 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1650 #else 1651 xarray);CHKERRCUSPARSE(stat); 1652 #endif 1653 1654 /* Then, solve U */ 1655 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1656 upTriFactor->csrMat->num_rows, 1657 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1658 upTriFactor->csrMat->num_entries, 1659 #endif 1660 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1661 upTriFactor->csrMat->values->data().get(), 1662 upTriFactor->csrMat->row_offsets->data().get(), 1663 upTriFactor->csrMat->column_indices->data().get(), 1664 upTriFactor->solveInfo,xarray, 1665 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1666 tempGPU->data().get(), 1667 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1668 #else 1669 tempGPU->data().get());CHKERRCUSPARSE(stat); 1670 #endif 1671 1672 /* Last, reorder with the column permutation */ 1673 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1674 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1675 xGPU); 1676 1677 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1678 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1679 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1680 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1681 PetscFunctionReturn(0); 1682 } 1683 1684 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1685 { 1686 const PetscScalar *barray; 1687 PetscScalar *xarray; 1688 cusparseStatus_t stat; 1689 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1690 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1691 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1692 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1693 PetscErrorCode ierr; 1694 1695 PetscFunctionBegin; 1696 /* Get the GPU pointers */ 1697 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1698 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1699 1700 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1701 /* First, solve L */ 1702 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1703 loTriFactor->csrMat->num_rows, 1704 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1705 loTriFactor->csrMat->num_entries, 1706 #endif 1707 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1708 loTriFactor->csrMat->values->data().get(), 1709 loTriFactor->csrMat->row_offsets->data().get(), 1710 loTriFactor->csrMat->column_indices->data().get(), 1711 loTriFactor->solveInfo, 1712 barray, 1713 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1714 tempGPU->data().get(), 1715 loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1716 #else 1717 tempGPU->data().get());CHKERRCUSPARSE(stat); 1718 #endif 1719 1720 /* Next, solve U */ 1721 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1722 upTriFactor->csrMat->num_rows, 1723 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1724 upTriFactor->csrMat->num_entries, 1725 #endif 1726 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1727 upTriFactor->csrMat->values->data().get(), 1728 upTriFactor->csrMat->row_offsets->data().get(), 1729 upTriFactor->csrMat->column_indices->data().get(), 1730 upTriFactor->solveInfo, 1731 tempGPU->data().get(), 1732 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1733 xarray, 1734 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1735 #else 1736 xarray);CHKERRCUSPARSE(stat); 1737 #endif 1738 1739 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1740 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1741 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1742 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1743 PetscFunctionReturn(0); 1744 } 1745 1746 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1747 { 1748 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1749 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1750 cudaError_t cerr; 1751 PetscErrorCode ierr; 1752 1753 PetscFunctionBegin; 1754 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1755 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1756 1757 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1758 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1759 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1760 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1761 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1762 A->offloadmask = PETSC_OFFLOAD_BOTH; 1763 } 1764 PetscFunctionReturn(0); 1765 } 1766 1767 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1768 { 1769 PetscErrorCode ierr; 1770 1771 PetscFunctionBegin; 1772 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1773 *array = ((Mat_SeqAIJ*)A->data)->a; 1774 PetscFunctionReturn(0); 1775 } 1776 1777 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1778 { 1779 PetscFunctionBegin; 1780 A->offloadmask = PETSC_OFFLOAD_CPU; 1781 *array = NULL; 1782 PetscFunctionReturn(0); 1783 } 1784 1785 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1786 { 1787 PetscErrorCode ierr; 1788 1789 PetscFunctionBegin; 1790 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1791 *array = ((Mat_SeqAIJ*)A->data)->a; 1792 PetscFunctionReturn(0); 1793 } 1794 1795 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1796 { 1797 PetscFunctionBegin; 1798 *array = NULL; 1799 PetscFunctionReturn(0); 1800 } 1801 1802 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1803 { 1804 PetscFunctionBegin; 1805 *array = ((Mat_SeqAIJ*)A->data)->a; 1806 PetscFunctionReturn(0); 1807 } 1808 1809 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1810 { 1811 PetscFunctionBegin; 1812 A->offloadmask = PETSC_OFFLOAD_CPU; 1813 *array = NULL; 1814 PetscFunctionReturn(0); 1815 } 1816 1817 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1818 { 1819 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1820 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1821 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1822 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1823 PetscErrorCode ierr; 1824 cusparseStatus_t stat; 1825 PetscBool both = PETSC_TRUE; 1826 cudaError_t err; 1827 1828 PetscFunctionBegin; 1829 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1830 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1831 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1832 CsrMatrix *matrix; 1833 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1834 1835 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1836 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1837 matrix->values->assign(a->a, a->a+a->nz); 1838 err = WaitForCUDA();CHKERRCUDA(err); 1839 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1840 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1841 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1842 } else { 1843 PetscInt nnz; 1844 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1845 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1846 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1847 delete cusparsestruct->workVector; 1848 delete cusparsestruct->rowoffsets_gpu; 1849 cusparsestruct->workVector = NULL; 1850 cusparsestruct->rowoffsets_gpu = NULL; 1851 try { 1852 if (a->compressedrow.use) { 1853 m = a->compressedrow.nrows; 1854 ii = a->compressedrow.i; 1855 ridx = a->compressedrow.rindex; 1856 } else { 1857 m = A->rmap->n; 1858 ii = a->i; 1859 ridx = NULL; 1860 } 1861 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1862 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1863 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1864 else nnz = a->nz; 1865 1866 /* create cusparse matrix */ 1867 cusparsestruct->nrows = m; 1868 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1869 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1870 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1871 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1872 1873 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1874 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1875 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1876 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1877 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1878 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1879 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1880 1881 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1882 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1883 /* set the matrix */ 1884 CsrMatrix *mat= new CsrMatrix; 1885 mat->num_rows = m; 1886 mat->num_cols = A->cmap->n; 1887 mat->num_entries = nnz; 1888 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1889 mat->row_offsets->assign(ii, ii + m+1); 1890 1891 mat->column_indices = new THRUSTINTARRAY32(nnz); 1892 mat->column_indices->assign(a->j, a->j+nnz); 1893 1894 mat->values = new THRUSTARRAY(nnz); 1895 if (a->a) mat->values->assign(a->a, a->a+nnz); 1896 1897 /* assign the pointer */ 1898 matstruct->mat = mat; 1899 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1900 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1901 stat = cusparseCreateCsr(&matstruct->matDescr, 1902 mat->num_rows, mat->num_cols, mat->num_entries, 1903 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1904 mat->values->data().get(), 1905 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1906 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1907 } 1908 #endif 1909 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1910 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1911 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1912 #else 1913 CsrMatrix *mat= new CsrMatrix; 1914 mat->num_rows = m; 1915 mat->num_cols = A->cmap->n; 1916 mat->num_entries = nnz; 1917 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1918 mat->row_offsets->assign(ii, ii + m+1); 1919 1920 mat->column_indices = new THRUSTINTARRAY32(nnz); 1921 mat->column_indices->assign(a->j, a->j+nnz); 1922 1923 mat->values = new THRUSTARRAY(nnz); 1924 if (a->a) mat->values->assign(a->a, a->a+nnz); 1925 1926 cusparseHybMat_t hybMat; 1927 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1928 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1929 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1930 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1931 matstruct->descr, mat->values->data().get(), 1932 mat->row_offsets->data().get(), 1933 mat->column_indices->data().get(), 1934 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1935 /* assign the pointer */ 1936 matstruct->mat = hybMat; 1937 1938 if (mat) { 1939 if (mat->values) delete (THRUSTARRAY*)mat->values; 1940 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1941 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1942 delete (CsrMatrix*)mat; 1943 } 1944 #endif 1945 } 1946 1947 /* assign the compressed row indices */ 1948 if (a->compressedrow.use) { 1949 cusparsestruct->workVector = new THRUSTARRAY(m); 1950 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1951 matstruct->cprowIndices->assign(ridx,ridx+m); 1952 tmp = m; 1953 } else { 1954 cusparsestruct->workVector = NULL; 1955 matstruct->cprowIndices = NULL; 1956 tmp = 0; 1957 } 1958 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1959 1960 /* assign the pointer */ 1961 cusparsestruct->mat = matstruct; 1962 } catch(char *ex) { 1963 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1964 } 1965 err = WaitForCUDA();CHKERRCUDA(err); 1966 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1967 cusparsestruct->nonzerostate = A->nonzerostate; 1968 } 1969 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1970 } 1971 PetscFunctionReturn(0); 1972 } 1973 1974 struct VecCUDAPlusEquals 1975 { 1976 template <typename Tuple> 1977 __host__ __device__ 1978 void operator()(Tuple t) 1979 { 1980 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1981 } 1982 }; 1983 1984 struct VecCUDAEquals 1985 { 1986 template <typename Tuple> 1987 __host__ __device__ 1988 void operator()(Tuple t) 1989 { 1990 thrust::get<1>(t) = thrust::get<0>(t); 1991 } 1992 }; 1993 1994 struct VecCUDAEqualsReverse 1995 { 1996 template <typename Tuple> 1997 __host__ __device__ 1998 void operator()(Tuple t) 1999 { 2000 thrust::get<0>(t) = thrust::get<1>(t); 2001 } 2002 }; 2003 2004 struct MatMatCusparse { 2005 PetscBool cisdense; 2006 PetscScalar *Bt; 2007 Mat X; 2008 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2009 PetscLogDouble flops; 2010 CsrMatrix *Bcsr; 2011 2012 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2013 cusparseSpMatDescr_t matSpBDescr; 2014 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2015 cusparseDnMatDescr_t matBDescr; 2016 cusparseDnMatDescr_t matCDescr; 2017 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2018 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2019 void *dBuffer4; 2020 void *dBuffer5; 2021 #endif 2022 size_t mmBufferSize; 2023 void *mmBuffer; 2024 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2025 cusparseSpGEMMDescr_t spgemmDesc; 2026 #endif 2027 }; 2028 2029 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2030 { 2031 PetscErrorCode ierr; 2032 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2033 cudaError_t cerr; 2034 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2035 cusparseStatus_t stat; 2036 #endif 2037 2038 PetscFunctionBegin; 2039 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 2040 delete mmdata->Bcsr; 2041 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2042 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 2043 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 2044 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 2045 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 2046 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2047 if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 2048 if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 2049 #endif 2050 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 2051 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 2052 #endif 2053 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 2054 ierr = PetscFree(data);CHKERRQ(ierr); 2055 PetscFunctionReturn(0); 2056 } 2057 2058 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2059 2060 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2061 { 2062 Mat_Product *product = C->product; 2063 Mat A,B; 2064 PetscInt m,n,blda,clda; 2065 PetscBool flg,biscuda; 2066 Mat_SeqAIJCUSPARSE *cusp; 2067 cusparseStatus_t stat; 2068 cusparseOperation_t opA; 2069 const PetscScalar *barray; 2070 PetscScalar *carray; 2071 PetscErrorCode ierr; 2072 MatMatCusparse *mmdata; 2073 Mat_SeqAIJCUSPARSEMultStruct *mat; 2074 CsrMatrix *csrmat; 2075 2076 PetscFunctionBegin; 2077 MatCheckProduct(C,1); 2078 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2079 mmdata = (MatMatCusparse*)product->data; 2080 A = product->A; 2081 B = product->B; 2082 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2083 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2084 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2085 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2086 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2087 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2088 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2089 switch (product->type) { 2090 case MATPRODUCT_AB: 2091 case MATPRODUCT_PtAP: 2092 mat = cusp->mat; 2093 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2094 m = A->rmap->n; 2095 n = B->cmap->n; 2096 break; 2097 case MATPRODUCT_AtB: 2098 if (!A->form_explicit_transpose) { 2099 mat = cusp->mat; 2100 opA = CUSPARSE_OPERATION_TRANSPOSE; 2101 } else { 2102 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2103 mat = cusp->matTranspose; 2104 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2105 } 2106 m = A->cmap->n; 2107 n = B->cmap->n; 2108 break; 2109 case MATPRODUCT_ABt: 2110 case MATPRODUCT_RARt: 2111 mat = cusp->mat; 2112 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2113 m = A->rmap->n; 2114 n = B->rmap->n; 2115 break; 2116 default: 2117 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2118 } 2119 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2120 csrmat = (CsrMatrix*)mat->mat; 2121 /* if the user passed a CPU matrix, copy the data to the GPU */ 2122 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2123 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2124 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2125 2126 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2127 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2128 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2129 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2130 } else { 2131 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2132 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2133 } 2134 2135 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2136 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2137 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2138 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2139 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2140 size_t mmBufferSize; 2141 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2142 if (!mmdata->matBDescr) { 2143 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2144 mmdata->Blda = blda; 2145 } 2146 2147 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2148 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2149 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2150 mmdata->Clda = clda; 2151 } 2152 2153 if (!mat->matDescr) { 2154 stat = cusparseCreateCsr(&mat->matDescr, 2155 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2156 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2157 csrmat->values->data().get(), 2158 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2159 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2160 } 2161 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2162 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2163 mmdata->matCDescr,cusparse_scalartype, 2164 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2165 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2166 cudaError_t cerr; 2167 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2168 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2169 mmdata->mmBufferSize = mmBufferSize; 2170 } 2171 mmdata->initialized = PETSC_TRUE; 2172 } else { 2173 /* to be safe, always update pointers of the mats */ 2174 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2175 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2176 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2177 } 2178 2179 /* do cusparseSpMM, which supports transpose on B */ 2180 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2181 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2182 mmdata->matCDescr,cusparse_scalartype, 2183 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2184 #else 2185 PetscInt k; 2186 /* cusparseXcsrmm does not support transpose on B */ 2187 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2188 cublasHandle_t cublasv2handle; 2189 cublasStatus_t cerr; 2190 2191 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2192 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2193 B->cmap->n,B->rmap->n, 2194 &PETSC_CUSPARSE_ONE ,barray,blda, 2195 &PETSC_CUSPARSE_ZERO,barray,blda, 2196 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2197 blda = B->cmap->n; 2198 k = B->cmap->n; 2199 } else { 2200 k = B->rmap->n; 2201 } 2202 2203 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2204 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2205 csrmat->num_entries,mat->alpha_one,mat->descr, 2206 csrmat->values->data().get(), 2207 csrmat->row_offsets->data().get(), 2208 csrmat->column_indices->data().get(), 2209 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2210 carray,clda);CHKERRCUSPARSE(stat); 2211 #endif 2212 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2213 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2214 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2215 if (product->type == MATPRODUCT_RARt) { 2216 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2217 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2218 } else if (product->type == MATPRODUCT_PtAP) { 2219 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2220 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2221 } else { 2222 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2223 } 2224 if (mmdata->cisdense) { 2225 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2226 } 2227 if (!biscuda) { 2228 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2229 } 2230 PetscFunctionReturn(0); 2231 } 2232 2233 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2234 { 2235 Mat_Product *product = C->product; 2236 Mat A,B; 2237 PetscInt m,n; 2238 PetscBool cisdense,flg; 2239 PetscErrorCode ierr; 2240 MatMatCusparse *mmdata; 2241 Mat_SeqAIJCUSPARSE *cusp; 2242 2243 PetscFunctionBegin; 2244 MatCheckProduct(C,1); 2245 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2246 A = product->A; 2247 B = product->B; 2248 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2249 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2250 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2251 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2252 switch (product->type) { 2253 case MATPRODUCT_AB: 2254 m = A->rmap->n; 2255 n = B->cmap->n; 2256 break; 2257 case MATPRODUCT_AtB: 2258 m = A->cmap->n; 2259 n = B->cmap->n; 2260 break; 2261 case MATPRODUCT_ABt: 2262 m = A->rmap->n; 2263 n = B->rmap->n; 2264 break; 2265 case MATPRODUCT_PtAP: 2266 m = B->cmap->n; 2267 n = B->cmap->n; 2268 break; 2269 case MATPRODUCT_RARt: 2270 m = B->rmap->n; 2271 n = B->rmap->n; 2272 break; 2273 default: 2274 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2275 } 2276 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2277 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2278 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2279 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2280 2281 /* product data */ 2282 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2283 mmdata->cisdense = cisdense; 2284 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2285 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2286 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2287 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2288 } 2289 #endif 2290 /* for these products we need intermediate storage */ 2291 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2292 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2293 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2294 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2295 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2296 } else { 2297 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2298 } 2299 } 2300 C->product->data = mmdata; 2301 C->product->destroy = MatDestroy_MatMatCusparse; 2302 2303 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2304 PetscFunctionReturn(0); 2305 } 2306 2307 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2308 { 2309 Mat_Product *product = C->product; 2310 Mat A,B; 2311 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2312 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2313 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2314 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2315 PetscBool flg; 2316 PetscErrorCode ierr; 2317 cusparseStatus_t stat; 2318 cudaError_t cerr; 2319 MatProductType ptype; 2320 MatMatCusparse *mmdata; 2321 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2322 cusparseSpMatDescr_t BmatSpDescr; 2323 #endif 2324 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2325 2326 PetscFunctionBegin; 2327 MatCheckProduct(C,1); 2328 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2329 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2330 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2331 mmdata = (MatMatCusparse*)C->product->data; 2332 A = product->A; 2333 B = product->B; 2334 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2335 mmdata->reusesym = PETSC_FALSE; 2336 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2337 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2338 Cmat = Ccusp->mat; 2339 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2340 Ccsr = (CsrMatrix*)Cmat->mat; 2341 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2342 goto finalize; 2343 } 2344 if (!c->nz) goto finalize; 2345 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2346 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2347 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2348 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2349 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2350 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2351 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2352 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2353 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2354 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2355 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2356 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2357 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2358 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2359 2360 ptype = product->type; 2361 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2362 ptype = MATPRODUCT_AB; 2363 if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2364 } 2365 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2366 ptype = MATPRODUCT_AB; 2367 if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2368 } 2369 switch (ptype) { 2370 case MATPRODUCT_AB: 2371 Amat = Acusp->mat; 2372 Bmat = Bcusp->mat; 2373 break; 2374 case MATPRODUCT_AtB: 2375 Amat = Acusp->matTranspose; 2376 Bmat = Bcusp->mat; 2377 break; 2378 case MATPRODUCT_ABt: 2379 Amat = Acusp->mat; 2380 Bmat = Bcusp->matTranspose; 2381 break; 2382 default: 2383 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2384 } 2385 Cmat = Ccusp->mat; 2386 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2387 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2388 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2389 Acsr = (CsrMatrix*)Amat->mat; 2390 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2391 Ccsr = (CsrMatrix*)Cmat->mat; 2392 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2393 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2394 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2395 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2396 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2397 BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2398 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2399 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2400 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2401 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2402 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2403 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2404 #else 2405 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2406 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2407 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2408 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2409 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2410 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2411 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2412 #endif 2413 #else 2414 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2415 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2416 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2417 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2418 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2419 #endif 2420 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2421 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2422 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2423 C->offloadmask = PETSC_OFFLOAD_GPU; 2424 finalize: 2425 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2426 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2427 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2428 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2429 c->reallocs = 0; 2430 C->info.mallocs += 0; 2431 C->info.nz_unneeded = 0; 2432 C->assembled = C->was_assembled = PETSC_TRUE; 2433 C->num_ass++; 2434 PetscFunctionReturn(0); 2435 } 2436 2437 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2438 { 2439 Mat_Product *product = C->product; 2440 Mat A,B; 2441 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2442 Mat_SeqAIJ *a,*b,*c; 2443 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2444 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2445 PetscInt i,j,m,n,k; 2446 PetscBool flg; 2447 PetscErrorCode ierr; 2448 cusparseStatus_t stat; 2449 cudaError_t cerr; 2450 MatProductType ptype; 2451 MatMatCusparse *mmdata; 2452 PetscLogDouble flops; 2453 PetscBool biscompressed,ciscompressed; 2454 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2455 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2456 cusparseSpMatDescr_t BmatSpDescr; 2457 #else 2458 int cnz; 2459 #endif 2460 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2461 2462 PetscFunctionBegin; 2463 MatCheckProduct(C,1); 2464 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2465 A = product->A; 2466 B = product->B; 2467 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2468 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2469 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2470 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2471 a = (Mat_SeqAIJ*)A->data; 2472 b = (Mat_SeqAIJ*)B->data; 2473 /* product data */ 2474 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2475 C->product->data = mmdata; 2476 C->product->destroy = MatDestroy_MatMatCusparse; 2477 2478 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2479 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2480 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2481 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2482 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2483 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2484 2485 ptype = product->type; 2486 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2487 ptype = MATPRODUCT_AB; 2488 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2489 } 2490 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2491 ptype = MATPRODUCT_AB; 2492 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2493 } 2494 biscompressed = PETSC_FALSE; 2495 ciscompressed = PETSC_FALSE; 2496 switch (ptype) { 2497 case MATPRODUCT_AB: 2498 m = A->rmap->n; 2499 n = B->cmap->n; 2500 k = A->cmap->n; 2501 Amat = Acusp->mat; 2502 Bmat = Bcusp->mat; 2503 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2504 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2505 break; 2506 case MATPRODUCT_AtB: 2507 m = A->cmap->n; 2508 n = B->cmap->n; 2509 k = A->rmap->n; 2510 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2511 Amat = Acusp->matTranspose; 2512 Bmat = Bcusp->mat; 2513 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2514 break; 2515 case MATPRODUCT_ABt: 2516 m = A->rmap->n; 2517 n = B->rmap->n; 2518 k = A->cmap->n; 2519 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2520 Amat = Acusp->mat; 2521 Bmat = Bcusp->matTranspose; 2522 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2523 break; 2524 default: 2525 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2526 } 2527 2528 /* create cusparse matrix */ 2529 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2530 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2531 c = (Mat_SeqAIJ*)C->data; 2532 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2533 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2534 Ccsr = new CsrMatrix; 2535 2536 c->compressedrow.use = ciscompressed; 2537 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2538 c->compressedrow.nrows = a->compressedrow.nrows; 2539 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2540 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2541 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2542 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2543 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2544 } else { 2545 c->compressedrow.nrows = 0; 2546 c->compressedrow.i = NULL; 2547 c->compressedrow.rindex = NULL; 2548 Ccusp->workVector = NULL; 2549 Cmat->cprowIndices = NULL; 2550 } 2551 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2552 Ccusp->mat = Cmat; 2553 Ccusp->mat->mat = Ccsr; 2554 Ccsr->num_rows = Ccusp->nrows; 2555 Ccsr->num_cols = n; 2556 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2557 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2558 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2559 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2560 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2561 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2562 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2563 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2564 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2565 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2566 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2567 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2568 c->nz = 0; 2569 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2570 Ccsr->values = new THRUSTARRAY(c->nz); 2571 goto finalizesym; 2572 } 2573 2574 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2575 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2576 Acsr = (CsrMatrix*)Amat->mat; 2577 if (!biscompressed) { 2578 Bcsr = (CsrMatrix*)Bmat->mat; 2579 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2580 BmatSpDescr = Bmat->matDescr; 2581 #endif 2582 } else { /* we need to use row offsets for the full matrix */ 2583 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2584 Bcsr = new CsrMatrix; 2585 Bcsr->num_rows = B->rmap->n; 2586 Bcsr->num_cols = cBcsr->num_cols; 2587 Bcsr->num_entries = cBcsr->num_entries; 2588 Bcsr->column_indices = cBcsr->column_indices; 2589 Bcsr->values = cBcsr->values; 2590 if (!Bcusp->rowoffsets_gpu) { 2591 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2592 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2593 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2594 } 2595 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2596 mmdata->Bcsr = Bcsr; 2597 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2598 if (Bcsr->num_rows && Bcsr->num_cols) { 2599 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2600 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2601 Bcsr->values->data().get(), 2602 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2603 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2604 } 2605 BmatSpDescr = mmdata->matSpBDescr; 2606 #endif 2607 } 2608 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2609 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2610 /* precompute flops count */ 2611 if (ptype == MATPRODUCT_AB) { 2612 for (i=0, flops = 0; i<A->rmap->n; i++) { 2613 const PetscInt st = a->i[i]; 2614 const PetscInt en = a->i[i+1]; 2615 for (j=st; j<en; j++) { 2616 const PetscInt brow = a->j[j]; 2617 flops += 2.*(b->i[brow+1] - b->i[brow]); 2618 } 2619 } 2620 } else if (ptype == MATPRODUCT_AtB) { 2621 for (i=0, flops = 0; i<A->rmap->n; i++) { 2622 const PetscInt anzi = a->i[i+1] - a->i[i]; 2623 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2624 flops += (2.*anzi)*bnzi; 2625 } 2626 } else { /* TODO */ 2627 flops = 0.; 2628 } 2629 2630 mmdata->flops = flops; 2631 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2632 2633 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2634 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2635 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2636 NULL, NULL, NULL, 2637 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2638 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2639 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2640 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2641 { 2642 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 2643 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2644 */ 2645 void* dBuffer1 = NULL; 2646 void* dBuffer2 = NULL; 2647 void* dBuffer3 = NULL; 2648 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2649 size_t bufferSize1 = 0; 2650 size_t bufferSize2 = 0; 2651 size_t bufferSize3 = 0; 2652 size_t bufferSize4 = 0; 2653 size_t bufferSize5 = 0; 2654 2655 /*----------------------------------------------------------------------*/ 2656 /* ask bufferSize1 bytes for external memory */ 2657 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2658 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2659 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2660 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2661 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2662 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2663 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2664 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2665 2666 /*----------------------------------------------------------------------*/ 2667 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2668 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2669 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2670 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2671 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2672 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2673 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2674 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2675 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2676 cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2677 cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2678 2679 /*----------------------------------------------------------------------*/ 2680 /* get matrix C non-zero entries C_nnz1 */ 2681 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2682 c->nz = (PetscInt) C_nnz1; 2683 /* allocate matrix C */ 2684 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2685 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2686 /* update matC with the new pointers */ 2687 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2688 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2689 2690 /*----------------------------------------------------------------------*/ 2691 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2692 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2693 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2694 cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2695 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2696 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2697 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2698 cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2699 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2700 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2701 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2702 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2703 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2704 } 2705 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2706 size_t bufSize2; 2707 /* ask bufferSize bytes for external memory */ 2708 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2709 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2710 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2711 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2712 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2713 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2714 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2715 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2716 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2717 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2718 /* ask bufferSize again bytes for external memory */ 2719 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2720 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2721 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2722 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2723 /* The CUSPARSE documentation is not clear, nor the API 2724 We need both buffers to perform the operations properly! 2725 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2726 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2727 is stored in the descriptor! What a messy API... */ 2728 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2729 /* compute the intermediate product of A * B */ 2730 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2731 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2732 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2733 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2734 /* get matrix C non-zero entries C_nnz1 */ 2735 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2736 c->nz = (PetscInt) C_nnz1; 2737 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2738 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2739 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2740 Ccsr->values = new THRUSTARRAY(c->nz); 2741 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2742 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2743 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2744 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2745 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2746 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2747 #endif 2748 #else 2749 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2750 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2751 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2752 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2753 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2754 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2755 c->nz = cnz; 2756 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2757 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2758 Ccsr->values = new THRUSTARRAY(c->nz); 2759 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2760 2761 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2762 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2763 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2764 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2765 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2766 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2767 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2768 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2769 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2770 #endif 2771 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2772 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2773 finalizesym: 2774 c->singlemalloc = PETSC_FALSE; 2775 c->free_a = PETSC_TRUE; 2776 c->free_ij = PETSC_TRUE; 2777 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2778 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2779 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2780 PetscInt *d_i = c->i; 2781 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2782 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2783 ii = *Ccsr->row_offsets; 2784 jj = *Ccsr->column_indices; 2785 if (ciscompressed) d_i = c->compressedrow.i; 2786 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2787 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2788 } else { 2789 PetscInt *d_i = c->i; 2790 if (ciscompressed) d_i = c->compressedrow.i; 2791 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2792 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2793 } 2794 if (ciscompressed) { /* need to expand host row offsets */ 2795 PetscInt r = 0; 2796 c->i[0] = 0; 2797 for (k = 0; k < c->compressedrow.nrows; k++) { 2798 const PetscInt next = c->compressedrow.rindex[k]; 2799 const PetscInt old = c->compressedrow.i[k]; 2800 for (; r < next; r++) c->i[r+1] = old; 2801 } 2802 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2803 } 2804 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2805 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2806 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2807 c->maxnz = c->nz; 2808 c->nonzerorowcnt = 0; 2809 c->rmax = 0; 2810 for (k = 0; k < m; k++) { 2811 const PetscInt nn = c->i[k+1] - c->i[k]; 2812 c->ilen[k] = c->imax[k] = nn; 2813 c->nonzerorowcnt += (PetscInt)!!nn; 2814 c->rmax = PetscMax(c->rmax,nn); 2815 } 2816 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2817 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2818 Ccsr->num_entries = c->nz; 2819 2820 C->nonzerostate++; 2821 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2822 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2823 Ccusp->nonzerostate = C->nonzerostate; 2824 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2825 C->preallocated = PETSC_TRUE; 2826 C->assembled = PETSC_FALSE; 2827 C->was_assembled = PETSC_FALSE; 2828 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2829 mmdata->reusesym = PETSC_TRUE; 2830 C->offloadmask = PETSC_OFFLOAD_GPU; 2831 } 2832 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2833 PetscFunctionReturn(0); 2834 } 2835 2836 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2837 2838 /* handles sparse or dense B */ 2839 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2840 { 2841 Mat_Product *product = mat->product; 2842 PetscErrorCode ierr; 2843 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2844 2845 PetscFunctionBegin; 2846 MatCheckProduct(mat,1); 2847 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2848 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2849 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2850 } 2851 if (product->type == MATPRODUCT_ABC) { 2852 Ciscusp = PETSC_FALSE; 2853 if (!product->C->boundtocpu) { 2854 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2855 } 2856 } 2857 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 2858 PetscBool usecpu = PETSC_FALSE; 2859 switch (product->type) { 2860 case MATPRODUCT_AB: 2861 if (product->api_user) { 2862 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 2863 ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2864 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2865 } else { 2866 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 2867 ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2868 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2869 } 2870 break; 2871 case MATPRODUCT_AtB: 2872 if (product->api_user) { 2873 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 2874 ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2875 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2876 } else { 2877 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 2878 ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2879 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2880 } 2881 break; 2882 case MATPRODUCT_PtAP: 2883 if (product->api_user) { 2884 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 2885 ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2886 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2887 } else { 2888 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 2889 ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2890 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2891 } 2892 break; 2893 case MATPRODUCT_RARt: 2894 if (product->api_user) { 2895 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 2896 ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2897 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2898 } else { 2899 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 2900 ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2901 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2902 } 2903 break; 2904 case MATPRODUCT_ABC: 2905 if (product->api_user) { 2906 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 2907 ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2908 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2909 } else { 2910 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 2911 ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2912 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2913 } 2914 break; 2915 default: 2916 break; 2917 } 2918 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 2919 } 2920 /* dispatch */ 2921 if (isdense) { 2922 switch (product->type) { 2923 case MATPRODUCT_AB: 2924 case MATPRODUCT_AtB: 2925 case MATPRODUCT_ABt: 2926 case MATPRODUCT_PtAP: 2927 case MATPRODUCT_RARt: 2928 if (product->A->boundtocpu) { 2929 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2930 } else { 2931 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2932 } 2933 break; 2934 case MATPRODUCT_ABC: 2935 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2936 break; 2937 default: 2938 break; 2939 } 2940 } else if (Biscusp && Ciscusp) { 2941 switch (product->type) { 2942 case MATPRODUCT_AB: 2943 case MATPRODUCT_AtB: 2944 case MATPRODUCT_ABt: 2945 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2946 break; 2947 case MATPRODUCT_PtAP: 2948 case MATPRODUCT_RARt: 2949 case MATPRODUCT_ABC: 2950 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2951 break; 2952 default: 2953 break; 2954 } 2955 } else { /* fallback for AIJ */ 2956 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2957 } 2958 PetscFunctionReturn(0); 2959 } 2960 2961 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2962 { 2963 PetscErrorCode ierr; 2964 2965 PetscFunctionBegin; 2966 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2967 PetscFunctionReturn(0); 2968 } 2969 2970 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2971 { 2972 PetscErrorCode ierr; 2973 2974 PetscFunctionBegin; 2975 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2976 PetscFunctionReturn(0); 2977 } 2978 2979 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2980 { 2981 PetscErrorCode ierr; 2982 2983 PetscFunctionBegin; 2984 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2985 PetscFunctionReturn(0); 2986 } 2987 2988 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2989 { 2990 PetscErrorCode ierr; 2991 2992 PetscFunctionBegin; 2993 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2994 PetscFunctionReturn(0); 2995 } 2996 2997 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2998 { 2999 PetscErrorCode ierr; 3000 3001 PetscFunctionBegin; 3002 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3003 PetscFunctionReturn(0); 3004 } 3005 3006 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 3007 { 3008 int i = blockIdx.x*blockDim.x + threadIdx.x; 3009 if (i < n) y[idx[i]] += x[i]; 3010 } 3011 3012 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3013 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 3014 { 3015 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3016 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 3017 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3018 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 3019 PetscErrorCode ierr; 3020 cusparseStatus_t stat; 3021 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3022 PetscBool compressed; 3023 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3024 PetscInt nx,ny; 3025 #endif 3026 3027 PetscFunctionBegin; 3028 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3029 if (!a->nonzerorowcnt) { 3030 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 3031 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 3032 PetscFunctionReturn(0); 3033 } 3034 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3035 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3036 if (!trans) { 3037 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3038 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3039 } else { 3040 if (herm || !A->form_explicit_transpose) { 3041 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3042 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3043 } else { 3044 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);} 3045 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3046 } 3047 } 3048 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3049 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3050 3051 try { 3052 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3053 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 3054 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 3055 3056 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3057 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3058 /* z = A x + beta y. 3059 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3060 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3061 */ 3062 xptr = xarray; 3063 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3064 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3065 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3066 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3067 allocated to accommodate different uses. So we get the length info directly from mat. 3068 */ 3069 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3070 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3071 nx = mat->num_cols; 3072 ny = mat->num_rows; 3073 } 3074 #endif 3075 } else { 3076 /* z = A^T x + beta y 3077 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3078 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3079 */ 3080 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3081 dptr = zarray; 3082 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 3083 if (compressed) { /* Scatter x to work vector */ 3084 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3085 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3086 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3087 VecCUDAEqualsReverse()); 3088 } 3089 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3090 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3091 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3092 nx = mat->num_rows; 3093 ny = mat->num_cols; 3094 } 3095 #endif 3096 } 3097 3098 /* csr_spmv does y = alpha op(A) x + beta y */ 3099 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3100 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3101 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3102 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3103 cudaError_t cerr; 3104 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3105 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3106 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3107 matstruct->matDescr, 3108 matstruct->cuSpMV[opA].vecXDescr, beta, 3109 matstruct->cuSpMV[opA].vecYDescr, 3110 cusparse_scalartype, 3111 cusparsestruct->spmvAlg, 3112 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 3113 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 3114 3115 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3116 } else { 3117 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3118 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 3119 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 3120 } 3121 3122 stat = cusparseSpMV(cusparsestruct->handle, opA, 3123 matstruct->alpha_one, 3124 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3125 matstruct->cuSpMV[opA].vecXDescr, 3126 beta, 3127 matstruct->cuSpMV[opA].vecYDescr, 3128 cusparse_scalartype, 3129 cusparsestruct->spmvAlg, 3130 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 3131 #else 3132 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3133 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 3134 mat->num_rows, mat->num_cols, 3135 mat->num_entries, matstruct->alpha_one, matstruct->descr, 3136 mat->values->data().get(), mat->row_offsets->data().get(), 3137 mat->column_indices->data().get(), xptr, beta, 3138 dptr);CHKERRCUSPARSE(stat); 3139 #endif 3140 } else { 3141 if (cusparsestruct->nrows) { 3142 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3143 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3144 #else 3145 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3146 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 3147 matstruct->alpha_one, matstruct->descr, hybMat, 3148 xptr, beta, 3149 dptr);CHKERRCUSPARSE(stat); 3150 #endif 3151 } 3152 } 3153 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3154 3155 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3156 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3157 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3158 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3159 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3160 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3161 } 3162 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3163 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 3164 } 3165 3166 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3167 if (compressed) { 3168 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3169 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3170 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3171 prevent that. So I just add a ScatterAdd kernel. 3172 */ 3173 #if 0 3174 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3175 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3176 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3177 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3178 VecCUDAPlusEquals()); 3179 #else 3180 PetscInt n = matstruct->cprowIndices->size(); 3181 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3182 #endif 3183 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3184 } 3185 } else { 3186 if (yy && yy != zz) { 3187 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3188 } 3189 } 3190 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3191 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3192 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 3193 } catch(char *ex) { 3194 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3195 } 3196 if (yy) { 3197 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3198 } else { 3199 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3200 } 3201 PetscFunctionReturn(0); 3202 } 3203 3204 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3205 { 3206 PetscErrorCode ierr; 3207 3208 PetscFunctionBegin; 3209 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3210 PetscFunctionReturn(0); 3211 } 3212 3213 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 3214 { 3215 PetscErrorCode ierr; 3216 PetscObjectState onnz = A->nonzerostate; 3217 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3218 3219 PetscFunctionBegin; 3220 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3221 if (onnz != A->nonzerostate && cusp->deviceMat) { 3222 cudaError_t cerr; 3223 3224 ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3225 cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3226 cusp->deviceMat = NULL; 3227 } 3228 PetscFunctionReturn(0); 3229 } 3230 3231 /* --------------------------------------------------------------------------------*/ 3232 /*@ 3233 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3234 (the default parallel PETSc format). This matrix will ultimately pushed down 3235 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3236 assembly performance the user should preallocate the matrix storage by setting 3237 the parameter nz (or the array nnz). By setting these parameters accurately, 3238 performance during matrix assembly can be increased by more than a factor of 50. 3239 3240 Collective 3241 3242 Input Parameters: 3243 + comm - MPI communicator, set to PETSC_COMM_SELF 3244 . m - number of rows 3245 . n - number of columns 3246 . nz - number of nonzeros per row (same for all rows) 3247 - nnz - array containing the number of nonzeros in the various rows 3248 (possibly different for each row) or NULL 3249 3250 Output Parameter: 3251 . A - the matrix 3252 3253 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3254 MatXXXXSetPreallocation() paradgm instead of this routine directly. 3255 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3256 3257 Notes: 3258 If nnz is given then nz is ignored 3259 3260 The AIJ format (also called the Yale sparse matrix format or 3261 compressed row storage), is fully compatible with standard Fortran 77 3262 storage. That is, the stored row and column indices can begin at 3263 either one (as in Fortran) or zero. See the users' manual for details. 3264 3265 Specify the preallocated storage with either nz or nnz (not both). 3266 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3267 allocation. For large problems you MUST preallocate memory or you 3268 will get TERRIBLE performance, see the users' manual chapter on matrices. 3269 3270 By default, this format uses inodes (identical nodes) when possible, to 3271 improve numerical efficiency of matrix-vector products and solves. We 3272 search for consecutive rows with the same nonzero structure, thereby 3273 reusing matrix information to achieve increased efficiency. 3274 3275 Level: intermediate 3276 3277 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3278 @*/ 3279 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3280 { 3281 PetscErrorCode ierr; 3282 3283 PetscFunctionBegin; 3284 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3285 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3286 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3287 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3288 PetscFunctionReturn(0); 3289 } 3290 3291 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3292 { 3293 PetscErrorCode ierr; 3294 3295 PetscFunctionBegin; 3296 if (A->factortype == MAT_FACTOR_NONE) { 3297 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3298 } else { 3299 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3300 } 3301 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3302 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3303 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3304 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3305 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3306 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3307 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3308 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3309 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 3310 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3311 PetscFunctionReturn(0); 3312 } 3313 3314 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3315 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3316 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3317 { 3318 PetscErrorCode ierr; 3319 3320 PetscFunctionBegin; 3321 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3322 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3323 PetscFunctionReturn(0); 3324 } 3325 3326 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3327 { 3328 PetscErrorCode ierr; 3329 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3330 Mat_SeqAIJCUSPARSE *cy; 3331 Mat_SeqAIJCUSPARSE *cx; 3332 PetscScalar *ay; 3333 const PetscScalar *ax; 3334 CsrMatrix *csry,*csrx; 3335 3336 PetscFunctionBegin; 3337 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3338 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3339 if (X->ops->axpy != Y->ops->axpy) { 3340 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3341 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3342 PetscFunctionReturn(0); 3343 } 3344 /* if we are here, it means both matrices are bound to GPU */ 3345 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3346 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3347 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3348 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3349 csry = (CsrMatrix*)cy->mat->mat; 3350 csrx = (CsrMatrix*)cx->mat->mat; 3351 /* see if we can turn this into a cublas axpy */ 3352 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3353 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3354 if (eq) { 3355 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3356 } 3357 if (eq) str = SAME_NONZERO_PATTERN; 3358 } 3359 /* spgeam is buggy with one column */ 3360 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3361 3362 if (str == SUBSET_NONZERO_PATTERN) { 3363 cusparseStatus_t stat; 3364 PetscScalar b = 1.0; 3365 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3366 size_t bufferSize; 3367 void *buffer; 3368 cudaError_t cerr; 3369 #endif 3370 3371 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3372 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3373 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3374 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3375 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3376 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3377 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3378 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3379 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3380 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3381 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3382 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3383 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3384 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3385 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3386 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3387 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3388 #else 3389 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3390 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3391 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3392 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3393 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3394 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3395 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3396 #endif 3397 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3398 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3399 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3400 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3401 } else if (str == SAME_NONZERO_PATTERN) { 3402 cublasHandle_t cublasv2handle; 3403 cublasStatus_t berr; 3404 PetscBLASInt one = 1, bnz = 1; 3405 3406 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3407 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3408 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3409 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3410 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3411 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3412 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3413 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3414 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3415 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3416 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3417 } else { 3418 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3419 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3420 } 3421 PetscFunctionReturn(0); 3422 } 3423 3424 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3425 { 3426 PetscErrorCode ierr; 3427 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3428 PetscScalar *ay; 3429 cublasHandle_t cublasv2handle; 3430 cublasStatus_t berr; 3431 PetscBLASInt one = 1, bnz = 1; 3432 3433 PetscFunctionBegin; 3434 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3435 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3436 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3437 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3438 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3439 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3440 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3441 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3442 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3443 PetscFunctionReturn(0); 3444 } 3445 3446 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3447 { 3448 PetscErrorCode ierr; 3449 PetscBool both = PETSC_FALSE; 3450 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3451 3452 PetscFunctionBegin; 3453 if (A->factortype == MAT_FACTOR_NONE) { 3454 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3455 if (spptr->mat) { 3456 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3457 if (matrix->values) { 3458 both = PETSC_TRUE; 3459 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3460 } 3461 } 3462 if (spptr->matTranspose) { 3463 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3464 if (matrix->values) { 3465 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3466 } 3467 } 3468 } 3469 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3470 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3471 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3472 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3473 else A->offloadmask = PETSC_OFFLOAD_CPU; 3474 PetscFunctionReturn(0); 3475 } 3476 3477 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3478 { 3479 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3480 PetscErrorCode ierr; 3481 3482 PetscFunctionBegin; 3483 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3484 if (flg) { 3485 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3486 3487 A->ops->scale = MatScale_SeqAIJ; 3488 A->ops->axpy = MatAXPY_SeqAIJ; 3489 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3490 A->ops->mult = MatMult_SeqAIJ; 3491 A->ops->multadd = MatMultAdd_SeqAIJ; 3492 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3493 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3494 A->ops->multhermitiantranspose = NULL; 3495 A->ops->multhermitiantransposeadd = NULL; 3496 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3497 ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); 3498 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3499 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3500 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3501 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3502 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3503 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3504 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3505 } else { 3506 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3507 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3508 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3509 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3510 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3511 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3512 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3513 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3514 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3515 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3516 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3517 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3518 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3519 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3520 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3521 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3522 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3523 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3524 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3525 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3526 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3527 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3528 } 3529 A->boundtocpu = flg; 3530 if (flg && a->inode.size) { 3531 a->inode.use = PETSC_TRUE; 3532 } else { 3533 a->inode.use = PETSC_FALSE; 3534 } 3535 PetscFunctionReturn(0); 3536 } 3537 3538 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3539 { 3540 PetscErrorCode ierr; 3541 cusparseStatus_t stat; 3542 Mat B; 3543 3544 PetscFunctionBegin; 3545 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3546 if (reuse == MAT_INITIAL_MATRIX) { 3547 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3548 } else if (reuse == MAT_REUSE_MATRIX) { 3549 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3550 } 3551 B = *newmat; 3552 3553 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3554 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3555 3556 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3557 if (B->factortype == MAT_FACTOR_NONE) { 3558 Mat_SeqAIJCUSPARSE *spptr; 3559 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3560 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3561 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3562 spptr->format = MAT_CUSPARSE_CSR; 3563 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3564 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3565 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3566 #else 3567 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3568 #endif 3569 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3570 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3571 #endif 3572 B->spptr = spptr; 3573 } else { 3574 Mat_SeqAIJCUSPARSETriFactors *spptr; 3575 3576 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3577 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3578 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3579 B->spptr = spptr; 3580 } 3581 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3582 } 3583 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3584 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3585 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3586 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3587 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3588 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3589 3590 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3591 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3592 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3593 #if defined(PETSC_HAVE_HYPRE) 3594 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3595 #endif 3596 PetscFunctionReturn(0); 3597 } 3598 3599 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3600 { 3601 PetscErrorCode ierr; 3602 3603 PetscFunctionBegin; 3604 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3605 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3606 PetscFunctionReturn(0); 3607 } 3608 3609 /*MC 3610 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3611 3612 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3613 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3614 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3615 3616 Options Database Keys: 3617 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3618 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3619 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3620 3621 Level: beginner 3622 3623 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3624 M*/ 3625 3626 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3627 3628 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3629 { 3630 PetscErrorCode ierr; 3631 3632 PetscFunctionBegin; 3633 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3634 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3635 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3636 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3637 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3638 3639 PetscFunctionReturn(0); 3640 } 3641 3642 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3643 { 3644 PetscErrorCode ierr; 3645 cusparseStatus_t stat; 3646 3647 PetscFunctionBegin; 3648 if (*cusparsestruct) { 3649 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3650 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3651 delete (*cusparsestruct)->workVector; 3652 delete (*cusparsestruct)->rowoffsets_gpu; 3653 delete (*cusparsestruct)->cooPerm; 3654 delete (*cusparsestruct)->cooPerm_a; 3655 delete (*cusparsestruct)->csr2csc_i; 3656 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3657 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3658 } 3659 PetscFunctionReturn(0); 3660 } 3661 3662 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3663 { 3664 PetscFunctionBegin; 3665 if (*mat) { 3666 delete (*mat)->values; 3667 delete (*mat)->column_indices; 3668 delete (*mat)->row_offsets; 3669 delete *mat; 3670 *mat = 0; 3671 } 3672 PetscFunctionReturn(0); 3673 } 3674 3675 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3676 { 3677 cusparseStatus_t stat; 3678 PetscErrorCode ierr; 3679 3680 PetscFunctionBegin; 3681 if (*trifactor) { 3682 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3683 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3684 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3685 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3686 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3687 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3688 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3689 #endif 3690 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3691 } 3692 PetscFunctionReturn(0); 3693 } 3694 3695 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3696 { 3697 CsrMatrix *mat; 3698 cusparseStatus_t stat; 3699 cudaError_t err; 3700 3701 PetscFunctionBegin; 3702 if (*matstruct) { 3703 if ((*matstruct)->mat) { 3704 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3705 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3706 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3707 #else 3708 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3709 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3710 #endif 3711 } else { 3712 mat = (CsrMatrix*)(*matstruct)->mat; 3713 CsrMatrix_Destroy(&mat); 3714 } 3715 } 3716 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3717 delete (*matstruct)->cprowIndices; 3718 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3719 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3720 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3721 3722 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3723 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3724 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3725 for (int i=0; i<3; i++) { 3726 if (mdata->cuSpMV[i].initialized) { 3727 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3728 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3729 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3730 } 3731 } 3732 #endif 3733 delete *matstruct; 3734 *matstruct = NULL; 3735 } 3736 PetscFunctionReturn(0); 3737 } 3738 3739 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3740 { 3741 PetscErrorCode ierr; 3742 3743 PetscFunctionBegin; 3744 if (*trifactors) { 3745 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3746 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3747 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3748 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3749 delete (*trifactors)->rpermIndices; 3750 delete (*trifactors)->cpermIndices; 3751 delete (*trifactors)->workVector; 3752 (*trifactors)->rpermIndices = NULL; 3753 (*trifactors)->cpermIndices = NULL; 3754 (*trifactors)->workVector = NULL; 3755 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3756 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3757 (*trifactors)->init_dev_prop = PETSC_FALSE; 3758 } 3759 PetscFunctionReturn(0); 3760 } 3761 3762 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3763 { 3764 PetscErrorCode ierr; 3765 cusparseHandle_t handle; 3766 cusparseStatus_t stat; 3767 3768 PetscFunctionBegin; 3769 if (*trifactors) { 3770 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3771 if (handle = (*trifactors)->handle) { 3772 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3773 } 3774 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3775 } 3776 PetscFunctionReturn(0); 3777 } 3778 3779 struct IJCompare 3780 { 3781 __host__ __device__ 3782 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3783 { 3784 if (t1.get<0>() < t2.get<0>()) return true; 3785 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3786 return false; 3787 } 3788 }; 3789 3790 struct IJEqual 3791 { 3792 __host__ __device__ 3793 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3794 { 3795 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3796 return true; 3797 } 3798 }; 3799 3800 struct IJDiff 3801 { 3802 __host__ __device__ 3803 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3804 { 3805 return t1 == t2 ? 0 : 1; 3806 } 3807 }; 3808 3809 struct IJSum 3810 { 3811 __host__ __device__ 3812 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3813 { 3814 return t1||t2; 3815 } 3816 }; 3817 3818 #include <thrust/iterator/discard_iterator.h> 3819 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3820 { 3821 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3822 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3823 THRUSTARRAY *cooPerm_v = NULL; 3824 thrust::device_ptr<const PetscScalar> d_v; 3825 CsrMatrix *matrix; 3826 PetscErrorCode ierr; 3827 PetscInt n; 3828 3829 PetscFunctionBegin; 3830 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3831 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3832 if (!cusp->cooPerm) { 3833 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3834 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3835 PetscFunctionReturn(0); 3836 } 3837 matrix = (CsrMatrix*)cusp->mat->mat; 3838 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3839 if (!v) { 3840 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3841 goto finalize; 3842 } 3843 n = cusp->cooPerm->size(); 3844 if (isCudaMem(v)) { 3845 d_v = thrust::device_pointer_cast(v); 3846 } else { 3847 cooPerm_v = new THRUSTARRAY(n); 3848 cooPerm_v->assign(v,v+n); 3849 d_v = cooPerm_v->data(); 3850 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3851 } 3852 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3853 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3854 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3855 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3856 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3857 /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3858 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3859 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 3860 */ 3861 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3862 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3863 delete cooPerm_w; 3864 } else { 3865 /* all nonzeros in d_v[] are unique entries */ 3866 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3867 matrix->values->begin())); 3868 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3869 matrix->values->end())); 3870 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 3871 } 3872 } else { 3873 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3874 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3875 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3876 } else { 3877 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3878 matrix->values->begin())); 3879 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3880 matrix->values->end())); 3881 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3882 } 3883 } 3884 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3885 finalize: 3886 delete cooPerm_v; 3887 A->offloadmask = PETSC_OFFLOAD_GPU; 3888 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3889 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3890 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3891 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3892 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3893 a->reallocs = 0; 3894 A->info.mallocs += 0; 3895 A->info.nz_unneeded = 0; 3896 A->assembled = A->was_assembled = PETSC_TRUE; 3897 A->num_ass++; 3898 PetscFunctionReturn(0); 3899 } 3900 3901 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3902 { 3903 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3904 PetscErrorCode ierr; 3905 3906 PetscFunctionBegin; 3907 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3908 if (!cusp) PetscFunctionReturn(0); 3909 if (destroy) { 3910 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3911 delete cusp->csr2csc_i; 3912 cusp->csr2csc_i = NULL; 3913 } 3914 A->transupdated = PETSC_FALSE; 3915 PetscFunctionReturn(0); 3916 } 3917 3918 #include <thrust/binary_search.h> 3919 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3920 { 3921 PetscErrorCode ierr; 3922 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3923 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3924 PetscInt cooPerm_n, nzr = 0; 3925 cudaError_t cerr; 3926 3927 PetscFunctionBegin; 3928 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3929 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3930 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 3931 if (n != cooPerm_n) { 3932 delete cusp->cooPerm; 3933 delete cusp->cooPerm_a; 3934 cusp->cooPerm = NULL; 3935 cusp->cooPerm_a = NULL; 3936 } 3937 if (n) { 3938 THRUSTINTARRAY d_i(n); 3939 THRUSTINTARRAY d_j(n); 3940 THRUSTINTARRAY ii(A->rmap->n); 3941 3942 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3943 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3944 3945 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3946 d_i.assign(coo_i,coo_i+n); 3947 d_j.assign(coo_j,coo_j+n); 3948 3949 /* Ex. 3950 n = 6 3951 coo_i = [3,3,1,4,1,4] 3952 coo_j = [3,2,2,5,2,6] 3953 */ 3954 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3955 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3956 3957 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3958 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3959 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 3960 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 3961 THRUSTINTARRAY w = d_j; 3962 3963 /* 3964 d_i = [1,1,3,3,4,4] 3965 d_j = [2,2,2,3,5,6] 3966 cooPerm = [2,4,1,0,3,5] 3967 */ 3968 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 3969 3970 /* 3971 d_i = [1,3,3,4,4,x] 3972 ^ekey 3973 d_j = [2,2,3,5,6,x] 3974 ^nekye 3975 */ 3976 if (nekey == ekey) { /* all entries are unique */ 3977 delete cusp->cooPerm_a; 3978 cusp->cooPerm_a = NULL; 3979 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 3980 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 3981 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 3982 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 3983 (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 3984 w[0] = 0; 3985 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 3986 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 3987 } 3988 thrust::counting_iterator<PetscInt> search_begin(0); 3989 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 3990 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 3991 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 3992 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3993 3994 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3995 a->singlemalloc = PETSC_FALSE; 3996 a->free_a = PETSC_TRUE; 3997 a->free_ij = PETSC_TRUE; 3998 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3999 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4000 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4001 a->nz = a->maxnz = a->i[A->rmap->n]; 4002 a->rmax = 0; 4003 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 4004 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 4005 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4006 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 4007 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 4008 for (PetscInt i = 0; i < A->rmap->n; i++) { 4009 const PetscInt nnzr = a->i[i+1] - a->i[i]; 4010 nzr += (PetscInt)!!(nnzr); 4011 a->ilen[i] = a->imax[i] = nnzr; 4012 a->rmax = PetscMax(a->rmax,nnzr); 4013 } 4014 a->nonzerorowcnt = nzr; 4015 A->preallocated = PETSC_TRUE; 4016 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 4017 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 4018 } else { 4019 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 4020 } 4021 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 4022 4023 /* We want to allocate the CUSPARSE struct for matvec now. 4024 The code is so convoluted now that I prefer to copy zeros */ 4025 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 4026 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 4027 A->offloadmask = PETSC_OFFLOAD_CPU; 4028 A->nonzerostate++; 4029 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4030 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 4031 4032 A->assembled = PETSC_FALSE; 4033 A->was_assembled = PETSC_FALSE; 4034 PetscFunctionReturn(0); 4035 } 4036 4037 /*@C 4038 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 4039 4040 Not collective 4041 4042 Input Parameters: 4043 + A - the matrix 4044 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4045 4046 Output Parameters: 4047 + ia - the CSR row pointers 4048 - ja - the CSR column indices 4049 4050 Level: developer 4051 4052 Notes: 4053 When compressed is true, the CSR structure does not contain empty rows 4054 4055 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 4056 @*/ 4057 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4058 { 4059 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4060 CsrMatrix *csr; 4061 PetscErrorCode ierr; 4062 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4063 4064 PetscFunctionBegin; 4065 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4066 if (!i || !j) PetscFunctionReturn(0); 4067 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4068 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4069 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4070 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4071 csr = (CsrMatrix*)cusp->mat->mat; 4072 if (i) { 4073 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4074 if (!cusp->rowoffsets_gpu) { 4075 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4076 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4077 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4078 } 4079 *i = cusp->rowoffsets_gpu->data().get(); 4080 } else *i = csr->row_offsets->data().get(); 4081 } 4082 if (j) *j = csr->column_indices->data().get(); 4083 PetscFunctionReturn(0); 4084 } 4085 4086 /*@C 4087 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4088 4089 Not collective 4090 4091 Input Parameters: 4092 + A - the matrix 4093 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4094 4095 Output Parameters: 4096 + ia - the CSR row pointers 4097 - ja - the CSR column indices 4098 4099 Level: developer 4100 4101 .seealso: MatSeqAIJCUSPARSEGetIJ() 4102 @*/ 4103 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4104 { 4105 PetscFunctionBegin; 4106 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4107 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4108 if (i) *i = NULL; 4109 if (j) *j = NULL; 4110 PetscFunctionReturn(0); 4111 } 4112 4113 /*@C 4114 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4115 4116 Not Collective 4117 4118 Input Parameter: 4119 . A - a MATSEQAIJCUSPARSE matrix 4120 4121 Output Parameter: 4122 . a - pointer to the device data 4123 4124 Level: developer 4125 4126 Notes: may trigger host-device copies if up-to-date matrix data is on host 4127 4128 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 4129 @*/ 4130 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4131 { 4132 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4133 CsrMatrix *csr; 4134 PetscErrorCode ierr; 4135 4136 PetscFunctionBegin; 4137 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4138 PetscValidPointer(a,2); 4139 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4140 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4141 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4142 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4143 csr = (CsrMatrix*)cusp->mat->mat; 4144 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4145 *a = csr->values->data().get(); 4146 PetscFunctionReturn(0); 4147 } 4148 4149 /*@C 4150 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4151 4152 Not Collective 4153 4154 Input Parameter: 4155 . A - a MATSEQAIJCUSPARSE matrix 4156 4157 Output Parameter: 4158 . a - pointer to the device data 4159 4160 Level: developer 4161 4162 .seealso: MatSeqAIJCUSPARSEGetArrayRead() 4163 @*/ 4164 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4165 { 4166 PetscFunctionBegin; 4167 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4168 PetscValidPointer(a,2); 4169 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4170 *a = NULL; 4171 PetscFunctionReturn(0); 4172 } 4173 4174 /*@C 4175 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4176 4177 Not Collective 4178 4179 Input Parameter: 4180 . A - a MATSEQAIJCUSPARSE matrix 4181 4182 Output Parameter: 4183 . a - pointer to the device data 4184 4185 Level: developer 4186 4187 Notes: may trigger host-device copies if up-to-date matrix data is on host 4188 4189 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray() 4190 @*/ 4191 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4192 { 4193 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4194 CsrMatrix *csr; 4195 PetscErrorCode ierr; 4196 4197 PetscFunctionBegin; 4198 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4199 PetscValidPointer(a,2); 4200 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4201 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4202 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4203 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4204 csr = (CsrMatrix*)cusp->mat->mat; 4205 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4206 *a = csr->values->data().get(); 4207 A->offloadmask = PETSC_OFFLOAD_GPU; 4208 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4209 PetscFunctionReturn(0); 4210 } 4211 /*@C 4212 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 4213 4214 Not Collective 4215 4216 Input Parameter: 4217 . A - a MATSEQAIJCUSPARSE matrix 4218 4219 Output Parameter: 4220 . a - pointer to the device data 4221 4222 Level: developer 4223 4224 .seealso: MatSeqAIJCUSPARSEGetArray() 4225 @*/ 4226 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4227 { 4228 PetscErrorCode ierr; 4229 4230 PetscFunctionBegin; 4231 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4232 PetscValidPointer(a,2); 4233 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4234 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4235 *a = NULL; 4236 PetscFunctionReturn(0); 4237 } 4238 4239 /*@C 4240 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4241 4242 Not Collective 4243 4244 Input Parameter: 4245 . A - a MATSEQAIJCUSPARSE matrix 4246 4247 Output Parameter: 4248 . a - pointer to the device data 4249 4250 Level: developer 4251 4252 Notes: does not trigger host-device copies and flags data validity on the GPU 4253 4254 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 4255 @*/ 4256 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4257 { 4258 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4259 CsrMatrix *csr; 4260 PetscErrorCode ierr; 4261 4262 PetscFunctionBegin; 4263 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4264 PetscValidPointer(a,2); 4265 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4266 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4267 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4268 csr = (CsrMatrix*)cusp->mat->mat; 4269 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4270 *a = csr->values->data().get(); 4271 A->offloadmask = PETSC_OFFLOAD_GPU; 4272 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4273 PetscFunctionReturn(0); 4274 } 4275 4276 /*@C 4277 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 4278 4279 Not Collective 4280 4281 Input Parameter: 4282 . A - a MATSEQAIJCUSPARSE matrix 4283 4284 Output Parameter: 4285 . a - pointer to the device data 4286 4287 Level: developer 4288 4289 .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 4290 @*/ 4291 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4292 { 4293 PetscErrorCode ierr; 4294 4295 PetscFunctionBegin; 4296 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4297 PetscValidPointer(a,2); 4298 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4299 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4300 *a = NULL; 4301 PetscFunctionReturn(0); 4302 } 4303 4304 struct IJCompare4 4305 { 4306 __host__ __device__ 4307 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4308 { 4309 if (t1.get<0>() < t2.get<0>()) return true; 4310 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4311 return false; 4312 } 4313 }; 4314 4315 struct Shift 4316 { 4317 int _shift; 4318 4319 Shift(int shift) : _shift(shift) {} 4320 __host__ __device__ 4321 inline int operator() (const int &c) 4322 { 4323 return c + _shift; 4324 } 4325 }; 4326 4327 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4328 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4329 { 4330 PetscErrorCode ierr; 4331 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4332 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4333 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4334 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4335 PetscInt Annz,Bnnz; 4336 cusparseStatus_t stat; 4337 PetscInt i,m,n,zero = 0; 4338 cudaError_t cerr; 4339 4340 PetscFunctionBegin; 4341 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4342 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4343 PetscValidPointer(C,4); 4344 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4345 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4346 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 4347 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4348 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4349 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4350 if (reuse == MAT_INITIAL_MATRIX) { 4351 m = A->rmap->n; 4352 n = A->cmap->n + B->cmap->n; 4353 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4354 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4355 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4356 c = (Mat_SeqAIJ*)(*C)->data; 4357 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4358 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4359 Ccsr = new CsrMatrix; 4360 Cmat->cprowIndices = NULL; 4361 c->compressedrow.use = PETSC_FALSE; 4362 c->compressedrow.nrows = 0; 4363 c->compressedrow.i = NULL; 4364 c->compressedrow.rindex = NULL; 4365 Ccusp->workVector = NULL; 4366 Ccusp->nrows = m; 4367 Ccusp->mat = Cmat; 4368 Ccusp->mat->mat = Ccsr; 4369 Ccsr->num_rows = m; 4370 Ccsr->num_cols = n; 4371 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4372 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4373 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4374 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4375 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4376 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4377 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4378 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4379 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4380 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4381 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4382 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4383 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4384 4385 Acsr = (CsrMatrix*)Acusp->mat->mat; 4386 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4387 Annz = (PetscInt)Acsr->column_indices->size(); 4388 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4389 c->nz = Annz + Bnnz; 4390 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4391 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4392 Ccsr->values = new THRUSTARRAY(c->nz); 4393 Ccsr->num_entries = c->nz; 4394 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4395 if (c->nz) { 4396 auto Acoo = new THRUSTINTARRAY32(Annz); 4397 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4398 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4399 THRUSTINTARRAY32 *Aroff,*Broff; 4400 4401 if (a->compressedrow.use) { /* need full row offset */ 4402 if (!Acusp->rowoffsets_gpu) { 4403 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4404 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4405 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4406 } 4407 Aroff = Acusp->rowoffsets_gpu; 4408 } else Aroff = Acsr->row_offsets; 4409 if (b->compressedrow.use) { /* need full row offset */ 4410 if (!Bcusp->rowoffsets_gpu) { 4411 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4412 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4413 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4414 } 4415 Broff = Bcusp->rowoffsets_gpu; 4416 } else Broff = Bcsr->row_offsets; 4417 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4418 stat = cusparseXcsr2coo(Acusp->handle, 4419 Aroff->data().get(), 4420 Annz, 4421 m, 4422 Acoo->data().get(), 4423 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4424 stat = cusparseXcsr2coo(Bcusp->handle, 4425 Broff->data().get(), 4426 Bnnz, 4427 m, 4428 Bcoo->data().get(), 4429 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4430 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4431 auto Aperm = thrust::make_constant_iterator(1); 4432 auto Bperm = thrust::make_constant_iterator(0); 4433 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4434 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4435 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4436 #else 4437 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4438 auto Bcib = Bcsr->column_indices->begin(); 4439 auto Bcie = Bcsr->column_indices->end(); 4440 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4441 #endif 4442 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4443 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4444 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4445 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4446 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4447 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4448 auto p1 = Ccusp->cooPerm->begin(); 4449 auto p2 = Ccusp->cooPerm->begin(); 4450 thrust::advance(p2,Annz); 4451 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4452 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4453 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4454 #endif 4455 auto cci = thrust::make_counting_iterator(zero); 4456 auto cce = thrust::make_counting_iterator(c->nz); 4457 #if 0 //Errors on SUMMIT cuda 11.1.0 4458 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4459 #else 4460 auto pred = thrust::identity<int>(); 4461 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4462 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4463 #endif 4464 stat = cusparseXcoo2csr(Ccusp->handle, 4465 Ccoo->data().get(), 4466 c->nz, 4467 m, 4468 Ccsr->row_offsets->data().get(), 4469 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4470 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4471 delete wPerm; 4472 delete Acoo; 4473 delete Bcoo; 4474 delete Ccoo; 4475 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4476 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4477 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4478 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4479 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4480 #endif 4481 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4482 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 4483 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 4484 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4485 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4486 CsrMatrix *CcsrT = new CsrMatrix; 4487 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4488 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4489 4490 (*C)->form_explicit_transpose = PETSC_TRUE; 4491 (*C)->transupdated = PETSC_TRUE; 4492 Ccusp->rowoffsets_gpu = NULL; 4493 CmatT->cprowIndices = NULL; 4494 CmatT->mat = CcsrT; 4495 CcsrT->num_rows = n; 4496 CcsrT->num_cols = m; 4497 CcsrT->num_entries = c->nz; 4498 4499 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4500 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4501 CcsrT->values = new THRUSTARRAY(c->nz); 4502 4503 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4504 auto rT = CcsrT->row_offsets->begin(); 4505 if (AT) { 4506 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4507 thrust::advance(rT,-1); 4508 } 4509 if (BT) { 4510 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4511 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4512 thrust::copy(titb,tite,rT); 4513 } 4514 auto cT = CcsrT->column_indices->begin(); 4515 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4516 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4517 auto vT = CcsrT->values->begin(); 4518 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4519 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4520 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4521 4522 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4523 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4524 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4525 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4526 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4527 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4528 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4529 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4530 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4531 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4532 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4533 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4534 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4535 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4536 #endif 4537 Ccusp->matTranspose = CmatT; 4538 } 4539 } 4540 4541 c->singlemalloc = PETSC_FALSE; 4542 c->free_a = PETSC_TRUE; 4543 c->free_ij = PETSC_TRUE; 4544 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4545 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4546 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4547 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4548 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4549 ii = *Ccsr->row_offsets; 4550 jj = *Ccsr->column_indices; 4551 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4552 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4553 } else { 4554 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4555 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4556 } 4557 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4558 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4559 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4560 c->maxnz = c->nz; 4561 c->nonzerorowcnt = 0; 4562 c->rmax = 0; 4563 for (i = 0; i < m; i++) { 4564 const PetscInt nn = c->i[i+1] - c->i[i]; 4565 c->ilen[i] = c->imax[i] = nn; 4566 c->nonzerorowcnt += (PetscInt)!!nn; 4567 c->rmax = PetscMax(c->rmax,nn); 4568 } 4569 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4570 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4571 (*C)->nonzerostate++; 4572 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4573 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4574 Ccusp->nonzerostate = (*C)->nonzerostate; 4575 (*C)->preallocated = PETSC_TRUE; 4576 } else { 4577 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4578 c = (Mat_SeqAIJ*)(*C)->data; 4579 if (c->nz) { 4580 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4581 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4582 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4583 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4584 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4585 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4586 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4587 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4588 Acsr = (CsrMatrix*)Acusp->mat->mat; 4589 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4590 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4591 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4592 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4593 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4594 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4595 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4596 auto pmid = Ccusp->cooPerm->begin(); 4597 thrust::advance(pmid,Acsr->num_entries); 4598 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4599 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4600 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4601 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4602 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4603 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4604 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4605 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4606 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4607 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4608 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4609 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4610 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4611 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4612 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4613 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4614 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4615 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4616 auto vT = CcsrT->values->begin(); 4617 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4618 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4619 (*C)->transupdated = PETSC_TRUE; 4620 } 4621 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4622 } 4623 } 4624 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4625 (*C)->assembled = PETSC_TRUE; 4626 (*C)->was_assembled = PETSC_FALSE; 4627 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4628 PetscFunctionReturn(0); 4629 } 4630 4631 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4632 { 4633 PetscErrorCode ierr; 4634 bool dmem; 4635 const PetscScalar *av; 4636 cudaError_t cerr; 4637 4638 PetscFunctionBegin; 4639 dmem = isCudaMem(v); 4640 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4641 if (n && idx) { 4642 THRUSTINTARRAY widx(n); 4643 widx.assign(idx,idx+n); 4644 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4645 4646 THRUSTARRAY *w = NULL; 4647 thrust::device_ptr<PetscScalar> dv; 4648 if (dmem) { 4649 dv = thrust::device_pointer_cast(v); 4650 } else { 4651 w = new THRUSTARRAY(n); 4652 dv = w->data(); 4653 } 4654 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4655 4656 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4657 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4658 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4659 if (w) { 4660 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4661 } 4662 delete w; 4663 } else { 4664 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4665 } 4666 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4667 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4668 PetscFunctionReturn(0); 4669 } 4670