1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <../src/mat/impls/sbaij/seq/sbaij.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15 #include <thrust/adjacent_difference.h> 16 #include <thrust/async/for_each.h> 17 #include <thrust/iterator/constant_iterator.h> 18 #include <thrust/remove.h> 19 #include <thrust/sort.h> 20 #include <thrust/unique.h> 21 22 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 26 27 typedef enum { 28 CUSPARSE_MV_ALG_DEFAULT = 0, 29 CUSPARSE_COOMV_ALG = 1, 30 CUSPARSE_CSRMV_ALG1 = 2, 31 CUSPARSE_CSRMV_ALG2 = 3 32 } cusparseSpMVAlg_t; 33 34 typedef enum { 35 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40 CUSPARSE_SPMM_ALG_DEFAULT = 0, 41 CUSPARSE_SPMM_COO_ALG1 = 1, 42 CUSPARSE_SPMM_COO_ALG2 = 2, 43 CUSPARSE_SPMM_COO_ALG3 = 3, 44 CUSPARSE_SPMM_COO_ALG4 = 5, 45 CUSPARSE_SPMM_CSR_ALG1 = 4, 46 CUSPARSE_SPMM_CSR_ALG2 = 6, 47 } cusparseSpMMAlg_t; 48 49 typedef enum { 50 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52 } cusparseCsr2CscAlg_t; 53 */ 54 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 56 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57 #endif 58 59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62 63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66 67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 81 82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 87 88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 90 91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,PetscInt[],PetscInt[]); 93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 94 95 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 96 { 97 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 98 99 PetscFunctionBegin; 100 switch (op) { 101 case MAT_CUSPARSE_MULT: 102 cusparsestruct->format = format; 103 break; 104 case MAT_CUSPARSE_ALL: 105 cusparsestruct->format = format; 106 break; 107 default: 108 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 109 } 110 PetscFunctionReturn(0); 111 } 112 113 /*@ 114 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 115 operation. Only the MatMult operation can use different GPU storage formats 116 for MPIAIJCUSPARSE matrices. 117 Not Collective 118 119 Input Parameters: 120 + A - Matrix of type SEQAIJCUSPARSE 121 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 122 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 123 124 Output Parameter: 125 126 Level: intermediate 127 128 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 129 @*/ 130 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 131 { 132 PetscFunctionBegin; 133 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 134 PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)); 135 PetscFunctionReturn(0); 136 } 137 138 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 139 { 140 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 141 142 PetscFunctionBegin; 143 cusparsestruct->use_cpu_solve = use_cpu; 144 PetscFunctionReturn(0); 145 } 146 147 /*@ 148 MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 149 150 Input Parameters: 151 + A - Matrix of type SEQAIJCUSPARSE 152 - use_cpu - set flag for using the built-in CPU MatSolve 153 154 Output Parameter: 155 156 Notes: 157 The cuSparse LU solver currently computes the factors with the built-in CPU method 158 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 159 This method to specify if the solve is done on the CPU or GPU (GPU is the default). 160 161 Level: intermediate 162 163 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 164 @*/ 165 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 166 { 167 PetscFunctionBegin; 168 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 169 PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)); 170 PetscFunctionReturn(0); 171 } 172 173 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 174 { 175 PetscFunctionBegin; 176 switch (op) { 177 case MAT_FORM_EXPLICIT_TRANSPOSE: 178 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 179 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 180 A->form_explicit_transpose = flg; 181 break; 182 default: 183 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 184 break; 185 } 186 PetscFunctionReturn(0); 187 } 188 189 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 190 191 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 192 { 193 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 194 IS isrow = b->row,iscol = b->col; 195 PetscBool row_identity,col_identity; 196 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 197 198 PetscFunctionBegin; 199 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 200 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 201 B->offloadmask = PETSC_OFFLOAD_CPU; 202 /* determine which version of MatSolve needs to be used. */ 203 PetscCall(ISIdentity(isrow,&row_identity)); 204 PetscCall(ISIdentity(iscol,&col_identity)); 205 206 if (!cusparsestruct->use_cpu_solve) { 207 if (row_identity && col_identity) { 208 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 209 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 210 } else { 211 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 212 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 213 } 214 } 215 B->ops->matsolve = NULL; 216 B->ops->matsolvetranspose = NULL; 217 218 /* get the triangular factors */ 219 if (!cusparsestruct->use_cpu_solve) { 220 PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 221 } 222 PetscFunctionReturn(0); 223 } 224 225 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 226 { 227 MatCUSPARSEStorageFormat format; 228 PetscBool flg; 229 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 230 231 PetscFunctionBegin; 232 PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options"); 233 if (A->factortype == MAT_FACTOR_NONE) { 234 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 235 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 236 if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format)); 237 238 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 239 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 240 if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 241 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg)); 242 if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve)); 243 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 244 PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 245 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg)); 246 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 247 #if CUSPARSE_VERSION > 11301 248 PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 249 #else 250 PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 251 #endif 252 PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 253 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg)); 254 PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 255 256 PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 257 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg)); 258 PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 259 #endif 260 } 261 PetscOptionsHeadEnd(); 262 PetscFunctionReturn(0); 263 } 264 265 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 266 { 267 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 268 PetscInt n = A->rmap->n; 269 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 270 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 271 const PetscInt *ai = a->i,*aj = a->j,*vi; 272 const MatScalar *aa = a->a,*v; 273 PetscInt *AiLo, *AjLo; 274 PetscInt i,nz, nzLower, offset, rowOffset; 275 276 PetscFunctionBegin; 277 if (!n) PetscFunctionReturn(0); 278 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 279 try { 280 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 281 nzLower=n+ai[n]-ai[1]; 282 if (!loTriFactor) { 283 PetscScalar *AALo; 284 285 PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar))); 286 287 /* Allocate Space for the lower triangular matrix */ 288 PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt))); 289 PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt))); 290 291 /* Fill the lower triangular matrix */ 292 AiLo[0] = (PetscInt) 0; 293 AiLo[n] = nzLower; 294 AjLo[0] = (PetscInt) 0; 295 AALo[0] = (MatScalar) 1.0; 296 v = aa; 297 vi = aj; 298 offset = 1; 299 rowOffset= 1; 300 for (i=1; i<n; i++) { 301 nz = ai[i+1] - ai[i]; 302 /* additional 1 for the term on the diagonal */ 303 AiLo[i] = rowOffset; 304 rowOffset += nz+1; 305 306 PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz)); 307 PetscCall(PetscArraycpy(&(AALo[offset]), v, nz)); 308 309 offset += nz; 310 AjLo[offset] = (PetscInt) i; 311 AALo[offset] = (MatScalar) 1.0; 312 offset += 1; 313 314 v += nz; 315 vi += nz; 316 } 317 318 /* allocate space for the triangular factor information */ 319 PetscCall(PetscNew(&loTriFactor)); 320 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 321 /* Create the matrix description */ 322 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 323 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 324 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 325 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 326 #else 327 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 328 #endif 329 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 330 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 331 332 /* set the operation */ 333 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 334 335 /* set the matrix */ 336 loTriFactor->csrMat = new CsrMatrix; 337 loTriFactor->csrMat->num_rows = n; 338 loTriFactor->csrMat->num_cols = n; 339 loTriFactor->csrMat->num_entries = nzLower; 340 341 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 342 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 343 344 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 345 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 346 347 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 348 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 349 350 /* Create the solve analysis information */ 351 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 352 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 353 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 354 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 355 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 356 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 357 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 358 &loTriFactor->solveBufferSize)); 359 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 360 #endif 361 362 /* perform the solve analysis */ 363 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 364 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 365 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 366 loTriFactor->csrMat->column_indices->data().get(), 367 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 368 loTriFactor->solveInfo, 369 loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 370 #else 371 loTriFactor->solveInfo)); 372 #endif 373 PetscCallCUDA(WaitForCUDA()); 374 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 375 376 /* assign the pointer */ 377 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 378 loTriFactor->AA_h = AALo; 379 PetscCallCUDA(cudaFreeHost(AiLo)); 380 PetscCallCUDA(cudaFreeHost(AjLo)); 381 PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar))); 382 } else { /* update values only */ 383 if (!loTriFactor->AA_h) { 384 PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar))); 385 } 386 /* Fill the lower triangular matrix */ 387 loTriFactor->AA_h[0] = 1.0; 388 v = aa; 389 vi = aj; 390 offset = 1; 391 for (i=1; i<n; i++) { 392 nz = ai[i+1] - ai[i]; 393 PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz)); 394 offset += nz; 395 loTriFactor->AA_h[offset] = 1.0; 396 offset += 1; 397 v += nz; 398 } 399 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 400 PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar))); 401 } 402 } catch(char *ex) { 403 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 404 } 405 } 406 PetscFunctionReturn(0); 407 } 408 409 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 410 { 411 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 412 PetscInt n = A->rmap->n; 413 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 414 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 415 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 416 const MatScalar *aa = a->a,*v; 417 PetscInt *AiUp, *AjUp; 418 PetscInt i,nz, nzUpper, offset; 419 420 PetscFunctionBegin; 421 if (!n) PetscFunctionReturn(0); 422 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 423 try { 424 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 425 nzUpper = adiag[0]-adiag[n]; 426 if (!upTriFactor) { 427 PetscScalar *AAUp; 428 429 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 430 431 /* Allocate Space for the upper triangular matrix */ 432 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 433 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 434 435 /* Fill the upper triangular matrix */ 436 AiUp[0]=(PetscInt) 0; 437 AiUp[n]=nzUpper; 438 offset = nzUpper; 439 for (i=n-1; i>=0; i--) { 440 v = aa + adiag[i+1] + 1; 441 vi = aj + adiag[i+1] + 1; 442 443 /* number of elements NOT on the diagonal */ 444 nz = adiag[i] - adiag[i+1]-1; 445 446 /* decrement the offset */ 447 offset -= (nz+1); 448 449 /* first, set the diagonal elements */ 450 AjUp[offset] = (PetscInt) i; 451 AAUp[offset] = (MatScalar)1./v[nz]; 452 AiUp[i] = AiUp[i+1] - (nz+1); 453 454 PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz)); 455 PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz)); 456 } 457 458 /* allocate space for the triangular factor information */ 459 PetscCall(PetscNew(&upTriFactor)); 460 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 461 462 /* Create the matrix description */ 463 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 464 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 465 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 466 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 467 #else 468 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 469 #endif 470 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 471 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 472 473 /* set the operation */ 474 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 475 476 /* set the matrix */ 477 upTriFactor->csrMat = new CsrMatrix; 478 upTriFactor->csrMat->num_rows = n; 479 upTriFactor->csrMat->num_cols = n; 480 upTriFactor->csrMat->num_entries = nzUpper; 481 482 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 483 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 484 485 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 486 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 487 488 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 489 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 490 491 /* Create the solve analysis information */ 492 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 493 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 494 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 495 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 496 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 497 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 498 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 499 &upTriFactor->solveBufferSize)); 500 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 501 #endif 502 503 /* perform the solve analysis */ 504 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 505 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 506 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 507 upTriFactor->csrMat->column_indices->data().get(), 508 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 509 upTriFactor->solveInfo, 510 upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 511 #else 512 upTriFactor->solveInfo)); 513 #endif 514 PetscCallCUDA(WaitForCUDA()); 515 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 516 517 /* assign the pointer */ 518 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 519 upTriFactor->AA_h = AAUp; 520 PetscCallCUDA(cudaFreeHost(AiUp)); 521 PetscCallCUDA(cudaFreeHost(AjUp)); 522 PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar))); 523 } else { 524 if (!upTriFactor->AA_h) { 525 PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar))); 526 } 527 /* Fill the upper triangular matrix */ 528 offset = nzUpper; 529 for (i=n-1; i>=0; i--) { 530 v = aa + adiag[i+1] + 1; 531 532 /* number of elements NOT on the diagonal */ 533 nz = adiag[i] - adiag[i+1]-1; 534 535 /* decrement the offset */ 536 offset -= (nz+1); 537 538 /* first, set the diagonal elements */ 539 upTriFactor->AA_h[offset] = 1./v[nz]; 540 PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz)); 541 } 542 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 543 PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar))); 544 } 545 } catch(char *ex) { 546 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 547 } 548 } 549 PetscFunctionReturn(0); 550 } 551 552 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 553 { 554 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 555 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 556 IS isrow = a->row,iscol = a->icol; 557 PetscBool row_identity,col_identity; 558 PetscInt n = A->rmap->n; 559 560 PetscFunctionBegin; 561 PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 562 PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 563 PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 564 565 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 566 cusparseTriFactors->nnz=a->nz; 567 568 A->offloadmask = PETSC_OFFLOAD_BOTH; 569 /* lower triangular indices */ 570 PetscCall(ISIdentity(isrow,&row_identity)); 571 if (!row_identity && !cusparseTriFactors->rpermIndices) { 572 const PetscInt *r; 573 574 PetscCall(ISGetIndices(isrow,&r)); 575 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 576 cusparseTriFactors->rpermIndices->assign(r, r+n); 577 PetscCall(ISRestoreIndices(isrow,&r)); 578 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 579 } 580 581 /* upper triangular indices */ 582 PetscCall(ISIdentity(iscol,&col_identity)); 583 if (!col_identity && !cusparseTriFactors->cpermIndices) { 584 const PetscInt *c; 585 586 PetscCall(ISGetIndices(iscol,&c)); 587 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 588 cusparseTriFactors->cpermIndices->assign(c, c+n); 589 PetscCall(ISRestoreIndices(iscol,&c)); 590 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 591 } 592 PetscFunctionReturn(0); 593 } 594 595 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 596 { 597 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 598 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 599 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 600 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 601 PetscInt *AiUp, *AjUp; 602 PetscScalar *AAUp; 603 PetscScalar *AALo; 604 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 605 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 606 const PetscInt *ai = b->i,*aj = b->j,*vj; 607 const MatScalar *aa = b->a,*v; 608 609 PetscFunctionBegin; 610 if (!n) PetscFunctionReturn(0); 611 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 612 try { 613 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 614 PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 615 if (!upTriFactor && !loTriFactor) { 616 /* Allocate Space for the upper triangular matrix */ 617 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 618 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 619 620 /* Fill the upper triangular matrix */ 621 AiUp[0]=(PetscInt) 0; 622 AiUp[n]=nzUpper; 623 offset = 0; 624 for (i=0; i<n; i++) { 625 /* set the pointers */ 626 v = aa + ai[i]; 627 vj = aj + ai[i]; 628 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 629 630 /* first, set the diagonal elements */ 631 AjUp[offset] = (PetscInt) i; 632 AAUp[offset] = (MatScalar)1.0/v[nz]; 633 AiUp[i] = offset; 634 AALo[offset] = (MatScalar)1.0/v[nz]; 635 636 offset+=1; 637 if (nz>0) { 638 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 639 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 640 for (j=offset; j<offset+nz; j++) { 641 AAUp[j] = -AAUp[j]; 642 AALo[j] = AAUp[j]/v[nz]; 643 } 644 offset+=nz; 645 } 646 } 647 648 /* allocate space for the triangular factor information */ 649 PetscCall(PetscNew(&upTriFactor)); 650 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 651 652 /* Create the matrix description */ 653 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 654 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 655 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 656 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 657 #else 658 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 659 #endif 660 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 661 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 662 663 /* set the matrix */ 664 upTriFactor->csrMat = new CsrMatrix; 665 upTriFactor->csrMat->num_rows = A->rmap->n; 666 upTriFactor->csrMat->num_cols = A->cmap->n; 667 upTriFactor->csrMat->num_entries = a->nz; 668 669 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 670 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 671 672 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 673 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 674 675 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 676 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 677 678 /* set the operation */ 679 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 680 681 /* Create the solve analysis information */ 682 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 683 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 684 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 685 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 686 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 687 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 688 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 689 &upTriFactor->solveBufferSize)); 690 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 691 #endif 692 693 /* perform the solve analysis */ 694 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 695 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 696 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 697 upTriFactor->csrMat->column_indices->data().get(), 698 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 699 upTriFactor->solveInfo, 700 upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 701 #else 702 upTriFactor->solveInfo)); 703 #endif 704 PetscCallCUDA(WaitForCUDA()); 705 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 706 707 /* assign the pointer */ 708 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 709 710 /* allocate space for the triangular factor information */ 711 PetscCall(PetscNew(&loTriFactor)); 712 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 713 714 /* Create the matrix description */ 715 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 716 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 717 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 718 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 719 #else 720 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 721 #endif 722 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 723 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 724 725 /* set the operation */ 726 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 727 728 /* set the matrix */ 729 loTriFactor->csrMat = new CsrMatrix; 730 loTriFactor->csrMat->num_rows = A->rmap->n; 731 loTriFactor->csrMat->num_cols = A->cmap->n; 732 loTriFactor->csrMat->num_entries = a->nz; 733 734 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 735 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 736 737 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 738 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 739 740 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 741 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 742 743 /* Create the solve analysis information */ 744 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 745 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 746 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 747 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 748 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 749 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 750 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 751 &loTriFactor->solveBufferSize)); 752 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 753 #endif 754 755 /* perform the solve analysis */ 756 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 757 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 758 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 759 loTriFactor->csrMat->column_indices->data().get(), 760 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 761 loTriFactor->solveInfo, 762 loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 763 #else 764 loTriFactor->solveInfo)); 765 #endif 766 PetscCallCUDA(WaitForCUDA()); 767 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 768 769 /* assign the pointer */ 770 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 771 772 PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 773 PetscCallCUDA(cudaFreeHost(AiUp)); 774 PetscCallCUDA(cudaFreeHost(AjUp)); 775 } else { 776 /* Fill the upper triangular matrix */ 777 offset = 0; 778 for (i=0; i<n; i++) { 779 /* set the pointers */ 780 v = aa + ai[i]; 781 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 782 783 /* first, set the diagonal elements */ 784 AAUp[offset] = 1.0/v[nz]; 785 AALo[offset] = 1.0/v[nz]; 786 787 offset+=1; 788 if (nz>0) { 789 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 790 for (j=offset; j<offset+nz; j++) { 791 AAUp[j] = -AAUp[j]; 792 AALo[j] = AAUp[j]/v[nz]; 793 } 794 offset+=nz; 795 } 796 } 797 PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 798 PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 799 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 800 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 801 PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar))); 802 } 803 PetscCallCUDA(cudaFreeHost(AAUp)); 804 PetscCallCUDA(cudaFreeHost(AALo)); 805 } catch(char *ex) { 806 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 807 } 808 } 809 PetscFunctionReturn(0); 810 } 811 812 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 813 { 814 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 815 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 816 IS ip = a->row; 817 PetscBool perm_identity; 818 PetscInt n = A->rmap->n; 819 820 PetscFunctionBegin; 821 PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 822 PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 823 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 824 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 825 826 A->offloadmask = PETSC_OFFLOAD_BOTH; 827 828 /* lower triangular indices */ 829 PetscCall(ISIdentity(ip,&perm_identity)); 830 if (!perm_identity) { 831 IS iip; 832 const PetscInt *irip,*rip; 833 834 PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip)); 835 PetscCall(ISGetIndices(iip,&irip)); 836 PetscCall(ISGetIndices(ip,&rip)); 837 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 838 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 839 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 840 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 841 PetscCall(ISRestoreIndices(iip,&irip)); 842 PetscCall(ISDestroy(&iip)); 843 PetscCall(ISRestoreIndices(ip,&rip)); 844 PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 845 } 846 PetscFunctionReturn(0); 847 } 848 849 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 850 { 851 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 852 IS ip = b->row; 853 PetscBool perm_identity; 854 855 PetscFunctionBegin; 856 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 857 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 858 B->offloadmask = PETSC_OFFLOAD_CPU; 859 /* determine which version of MatSolve needs to be used. */ 860 PetscCall(ISIdentity(ip,&perm_identity)); 861 if (perm_identity) { 862 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 863 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 864 B->ops->matsolve = NULL; 865 B->ops->matsolvetranspose = NULL; 866 } else { 867 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 868 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 869 B->ops->matsolve = NULL; 870 B->ops->matsolvetranspose = NULL; 871 } 872 873 /* get the triangular factors */ 874 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 875 PetscFunctionReturn(0); 876 } 877 878 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 879 { 880 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 881 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 882 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 883 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 884 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 885 cusparseIndexBase_t indexBase; 886 cusparseMatrixType_t matrixType; 887 cusparseFillMode_t fillMode; 888 cusparseDiagType_t diagType; 889 890 PetscFunctionBegin; 891 /* allocate space for the transpose of the lower triangular factor */ 892 PetscCall(PetscNew(&loTriFactorT)); 893 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 894 895 /* set the matrix descriptors of the lower triangular factor */ 896 matrixType = cusparseGetMatType(loTriFactor->descr); 897 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 898 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 899 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 900 diagType = cusparseGetMatDiagType(loTriFactor->descr); 901 902 /* Create the matrix description */ 903 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 904 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 905 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 906 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 907 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 908 909 /* set the operation */ 910 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 911 912 /* allocate GPU space for the CSC of the lower triangular factor*/ 913 loTriFactorT->csrMat = new CsrMatrix; 914 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 915 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 916 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 917 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 918 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 919 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 920 921 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 922 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 923 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 924 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 925 loTriFactor->csrMat->values->data().get(), 926 loTriFactor->csrMat->row_offsets->data().get(), 927 loTriFactor->csrMat->column_indices->data().get(), 928 loTriFactorT->csrMat->values->data().get(), 929 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 930 CUSPARSE_ACTION_NUMERIC,indexBase, 931 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 932 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 933 #endif 934 935 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 936 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 937 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 938 loTriFactor->csrMat->values->data().get(), 939 loTriFactor->csrMat->row_offsets->data().get(), 940 loTriFactor->csrMat->column_indices->data().get(), 941 loTriFactorT->csrMat->values->data().get(), 942 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 943 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 944 CUSPARSE_ACTION_NUMERIC, indexBase, 945 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 946 #else 947 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 948 CUSPARSE_ACTION_NUMERIC, indexBase)); 949 #endif 950 PetscCallCUDA(WaitForCUDA()); 951 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 952 953 /* Create the solve analysis information */ 954 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 955 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 956 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 957 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 958 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 959 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 960 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 961 &loTriFactorT->solveBufferSize)); 962 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 963 #endif 964 965 /* perform the solve analysis */ 966 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 967 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 968 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 969 loTriFactorT->csrMat->column_indices->data().get(), 970 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 971 loTriFactorT->solveInfo, 972 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 973 #else 974 loTriFactorT->solveInfo)); 975 #endif 976 PetscCallCUDA(WaitForCUDA()); 977 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 978 979 /* assign the pointer */ 980 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 981 982 /*********************************************/ 983 /* Now the Transpose of the Upper Tri Factor */ 984 /*********************************************/ 985 986 /* allocate space for the transpose of the upper triangular factor */ 987 PetscCall(PetscNew(&upTriFactorT)); 988 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 989 990 /* set the matrix descriptors of the upper triangular factor */ 991 matrixType = cusparseGetMatType(upTriFactor->descr); 992 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 993 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 994 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 995 diagType = cusparseGetMatDiagType(upTriFactor->descr); 996 997 /* Create the matrix description */ 998 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 999 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1000 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1001 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1002 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1003 1004 /* set the operation */ 1005 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1006 1007 /* allocate GPU space for the CSC of the upper triangular factor*/ 1008 upTriFactorT->csrMat = new CsrMatrix; 1009 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1010 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1011 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1012 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1013 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1014 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1015 1016 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1017 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1018 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1019 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1020 upTriFactor->csrMat->values->data().get(), 1021 upTriFactor->csrMat->row_offsets->data().get(), 1022 upTriFactor->csrMat->column_indices->data().get(), 1023 upTriFactorT->csrMat->values->data().get(), 1024 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1025 CUSPARSE_ACTION_NUMERIC,indexBase, 1026 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1027 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1028 #endif 1029 1030 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1031 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1032 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1033 upTriFactor->csrMat->values->data().get(), 1034 upTriFactor->csrMat->row_offsets->data().get(), 1035 upTriFactor->csrMat->column_indices->data().get(), 1036 upTriFactorT->csrMat->values->data().get(), 1037 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1038 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1039 CUSPARSE_ACTION_NUMERIC, indexBase, 1040 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1041 #else 1042 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1043 CUSPARSE_ACTION_NUMERIC, indexBase)); 1044 #endif 1045 1046 PetscCallCUDA(WaitForCUDA()); 1047 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1048 1049 /* Create the solve analysis information */ 1050 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1051 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 1052 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1053 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1054 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1055 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1056 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1057 &upTriFactorT->solveBufferSize)); 1058 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1059 #endif 1060 1061 /* perform the solve analysis */ 1062 /* christ, would it have killed you to put this stuff in a function????????? */ 1063 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1064 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1065 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1066 upTriFactorT->csrMat->column_indices->data().get(), 1067 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1068 upTriFactorT->solveInfo, 1069 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1070 #else 1071 upTriFactorT->solveInfo)); 1072 #endif 1073 1074 PetscCallCUDA(WaitForCUDA()); 1075 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1076 1077 /* assign the pointer */ 1078 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1079 PetscFunctionReturn(0); 1080 } 1081 1082 struct PetscScalarToPetscInt 1083 { 1084 __host__ __device__ 1085 PetscInt operator()(PetscScalar s) 1086 { 1087 return (PetscInt)PetscRealPart(s); 1088 } 1089 }; 1090 1091 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1092 { 1093 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1094 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1095 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1096 cusparseStatus_t stat; 1097 cusparseIndexBase_t indexBase; 1098 1099 PetscFunctionBegin; 1100 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1101 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1102 PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1103 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1104 PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1105 if (A->transupdated) PetscFunctionReturn(0); 1106 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1107 PetscCall(PetscLogGpuTimeBegin()); 1108 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1109 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1110 } 1111 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1112 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1113 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1114 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1115 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1116 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1117 1118 /* set alpha and beta */ 1119 PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 1120 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 1121 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1122 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1123 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1124 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1125 1126 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1127 CsrMatrix *matrixT = new CsrMatrix; 1128 matstructT->mat = matrixT; 1129 matrixT->num_rows = A->cmap->n; 1130 matrixT->num_cols = A->rmap->n; 1131 matrixT->num_entries = a->nz; 1132 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1133 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1134 matrixT->values = new THRUSTARRAY(a->nz); 1135 1136 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1137 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1138 1139 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1140 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1141 stat = cusparseCreateCsr(&matstructT->matDescr, 1142 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1143 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1144 matrixT->values->data().get(), 1145 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1146 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1147 #else 1148 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1149 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1150 1151 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1152 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1153 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1154 */ 1155 if (matrixT->num_entries) { 1156 stat = cusparseCreateCsr(&matstructT->matDescr, 1157 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1158 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1159 matrixT->values->data().get(), 1160 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1161 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1162 1163 } else { 1164 matstructT->matDescr = NULL; 1165 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1166 } 1167 #endif 1168 #endif 1169 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1170 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1171 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1172 #else 1173 CsrMatrix *temp = new CsrMatrix; 1174 CsrMatrix *tempT = new CsrMatrix; 1175 /* First convert HYB to CSR */ 1176 temp->num_rows = A->rmap->n; 1177 temp->num_cols = A->cmap->n; 1178 temp->num_entries = a->nz; 1179 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1180 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1181 temp->values = new THRUSTARRAY(a->nz); 1182 1183 stat = cusparse_hyb2csr(cusparsestruct->handle, 1184 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1185 temp->values->data().get(), 1186 temp->row_offsets->data().get(), 1187 temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1188 1189 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1190 tempT->num_rows = A->rmap->n; 1191 tempT->num_cols = A->cmap->n; 1192 tempT->num_entries = a->nz; 1193 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1194 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1195 tempT->values = new THRUSTARRAY(a->nz); 1196 1197 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1198 temp->num_cols, temp->num_entries, 1199 temp->values->data().get(), 1200 temp->row_offsets->data().get(), 1201 temp->column_indices->data().get(), 1202 tempT->values->data().get(), 1203 tempT->column_indices->data().get(), 1204 tempT->row_offsets->data().get(), 1205 CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1206 1207 /* Last, convert CSC to HYB */ 1208 cusparseHybMat_t hybMat; 1209 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1210 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1211 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1212 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1213 matstructT->descr, tempT->values->data().get(), 1214 tempT->row_offsets->data().get(), 1215 tempT->column_indices->data().get(), 1216 hybMat, 0, partition);PetscCallCUSPARSE(stat); 1217 1218 /* assign the pointer */ 1219 matstructT->mat = hybMat; 1220 A->transupdated = PETSC_TRUE; 1221 /* delete temporaries */ 1222 if (tempT) { 1223 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1224 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1225 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1226 delete (CsrMatrix*) tempT; 1227 } 1228 if (temp) { 1229 if (temp->values) delete (THRUSTARRAY*) temp->values; 1230 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1231 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1232 delete (CsrMatrix*) temp; 1233 } 1234 #endif 1235 } 1236 } 1237 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1238 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1239 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1240 PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1241 PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1242 PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1243 PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1244 PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1245 PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1246 PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1247 PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1248 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1249 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1250 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1251 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1252 } 1253 if (!cusparsestruct->csr2csc_i) { 1254 THRUSTARRAY csr2csc_a(matrix->num_entries); 1255 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1256 1257 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1258 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1259 void *csr2cscBuffer; 1260 size_t csr2cscBufferSize; 1261 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1262 A->cmap->n, matrix->num_entries, 1263 matrix->values->data().get(), 1264 cusparsestruct->rowoffsets_gpu->data().get(), 1265 matrix->column_indices->data().get(), 1266 matrixT->values->data().get(), 1267 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1268 CUSPARSE_ACTION_NUMERIC,indexBase, 1269 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat); 1270 PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize)); 1271 #endif 1272 1273 if (matrix->num_entries) { 1274 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1275 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1276 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1277 1278 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1279 should be filled with indexBase. So I just take a shortcut here. 1280 */ 1281 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1282 A->cmap->n,matrix->num_entries, 1283 csr2csc_a.data().get(), 1284 cusparsestruct->rowoffsets_gpu->data().get(), 1285 matrix->column_indices->data().get(), 1286 matrixT->values->data().get(), 1287 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1288 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1289 CUSPARSE_ACTION_NUMERIC,indexBase, 1290 cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat); 1291 #else 1292 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1293 CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1294 #endif 1295 } else { 1296 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1297 } 1298 1299 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1300 PetscCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1301 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1302 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1303 #endif 1304 } 1305 PetscCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1306 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1307 matrixT->values->begin())); 1308 } 1309 PetscCall(PetscLogGpuTimeEnd()); 1310 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1311 /* the compressed row indices is not used for matTranspose */ 1312 matstructT->cprowIndices = NULL; 1313 /* assign the pointer */ 1314 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1315 A->transupdated = PETSC_TRUE; 1316 PetscFunctionReturn(0); 1317 } 1318 1319 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1320 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1321 { 1322 PetscInt n = xx->map->n; 1323 const PetscScalar *barray; 1324 PetscScalar *xarray; 1325 thrust::device_ptr<const PetscScalar> bGPU; 1326 thrust::device_ptr<PetscScalar> xGPU; 1327 cusparseStatus_t stat; 1328 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1329 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1330 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1331 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1332 1333 PetscFunctionBegin; 1334 /* Analyze the matrix and create the transpose ... on the fly */ 1335 if (!loTriFactorT && !upTriFactorT) { 1336 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1337 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1338 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1339 } 1340 1341 /* Get the GPU pointers */ 1342 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1343 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1344 xGPU = thrust::device_pointer_cast(xarray); 1345 bGPU = thrust::device_pointer_cast(barray); 1346 1347 PetscCall(PetscLogGpuTimeBegin()); 1348 /* First, reorder with the row permutation */ 1349 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1350 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1351 xGPU); 1352 1353 /* First, solve U */ 1354 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1355 upTriFactorT->csrMat->num_rows, 1356 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1357 upTriFactorT->csrMat->num_entries, 1358 #endif 1359 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1360 upTriFactorT->csrMat->values->data().get(), 1361 upTriFactorT->csrMat->row_offsets->data().get(), 1362 upTriFactorT->csrMat->column_indices->data().get(), 1363 upTriFactorT->solveInfo, 1364 xarray, 1365 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1366 tempGPU->data().get(), 1367 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1368 #else 1369 tempGPU->data().get());PetscCallCUSPARSE(stat); 1370 #endif 1371 1372 /* Then, solve L */ 1373 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1374 loTriFactorT->csrMat->num_rows, 1375 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1376 loTriFactorT->csrMat->num_entries, 1377 #endif 1378 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1379 loTriFactorT->csrMat->values->data().get(), 1380 loTriFactorT->csrMat->row_offsets->data().get(), 1381 loTriFactorT->csrMat->column_indices->data().get(), 1382 loTriFactorT->solveInfo, 1383 tempGPU->data().get(), 1384 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1385 xarray, 1386 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1387 #else 1388 xarray);PetscCallCUSPARSE(stat); 1389 #endif 1390 1391 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1392 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1393 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1394 tempGPU->begin()); 1395 1396 /* Copy the temporary to the full solution. */ 1397 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1398 1399 /* restore */ 1400 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1401 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1402 PetscCall(PetscLogGpuTimeEnd()); 1403 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1404 PetscFunctionReturn(0); 1405 } 1406 1407 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1408 { 1409 const PetscScalar *barray; 1410 PetscScalar *xarray; 1411 cusparseStatus_t stat; 1412 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1413 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1414 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1415 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1416 1417 PetscFunctionBegin; 1418 /* Analyze the matrix and create the transpose ... on the fly */ 1419 if (!loTriFactorT && !upTriFactorT) { 1420 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1421 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1422 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1423 } 1424 1425 /* Get the GPU pointers */ 1426 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1427 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1428 1429 PetscCall(PetscLogGpuTimeBegin()); 1430 /* First, solve U */ 1431 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1432 upTriFactorT->csrMat->num_rows, 1433 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1434 upTriFactorT->csrMat->num_entries, 1435 #endif 1436 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1437 upTriFactorT->csrMat->values->data().get(), 1438 upTriFactorT->csrMat->row_offsets->data().get(), 1439 upTriFactorT->csrMat->column_indices->data().get(), 1440 upTriFactorT->solveInfo, 1441 barray, 1442 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1443 tempGPU->data().get(), 1444 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1445 #else 1446 tempGPU->data().get());PetscCallCUSPARSE(stat); 1447 #endif 1448 1449 /* Then, solve L */ 1450 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1451 loTriFactorT->csrMat->num_rows, 1452 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1453 loTriFactorT->csrMat->num_entries, 1454 #endif 1455 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1456 loTriFactorT->csrMat->values->data().get(), 1457 loTriFactorT->csrMat->row_offsets->data().get(), 1458 loTriFactorT->csrMat->column_indices->data().get(), 1459 loTriFactorT->solveInfo, 1460 tempGPU->data().get(), 1461 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1462 xarray, 1463 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1464 #else 1465 xarray);PetscCallCUSPARSE(stat); 1466 #endif 1467 1468 /* restore */ 1469 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1470 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1471 PetscCall(PetscLogGpuTimeEnd()); 1472 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1473 PetscFunctionReturn(0); 1474 } 1475 1476 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1477 { 1478 const PetscScalar *barray; 1479 PetscScalar *xarray; 1480 thrust::device_ptr<const PetscScalar> bGPU; 1481 thrust::device_ptr<PetscScalar> xGPU; 1482 cusparseStatus_t stat; 1483 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1484 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1485 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1486 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1487 1488 PetscFunctionBegin; 1489 1490 /* Get the GPU pointers */ 1491 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1492 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1493 xGPU = thrust::device_pointer_cast(xarray); 1494 bGPU = thrust::device_pointer_cast(barray); 1495 1496 PetscCall(PetscLogGpuTimeBegin()); 1497 /* First, reorder with the row permutation */ 1498 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1499 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1500 tempGPU->begin()); 1501 1502 /* Next, solve L */ 1503 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1504 loTriFactor->csrMat->num_rows, 1505 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1506 loTriFactor->csrMat->num_entries, 1507 #endif 1508 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1509 loTriFactor->csrMat->values->data().get(), 1510 loTriFactor->csrMat->row_offsets->data().get(), 1511 loTriFactor->csrMat->column_indices->data().get(), 1512 loTriFactor->solveInfo, 1513 tempGPU->data().get(), 1514 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1515 xarray, 1516 loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1517 #else 1518 xarray);PetscCallCUSPARSE(stat); 1519 #endif 1520 1521 /* Then, solve U */ 1522 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1523 upTriFactor->csrMat->num_rows, 1524 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1525 upTriFactor->csrMat->num_entries, 1526 #endif 1527 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1528 upTriFactor->csrMat->values->data().get(), 1529 upTriFactor->csrMat->row_offsets->data().get(), 1530 upTriFactor->csrMat->column_indices->data().get(), 1531 upTriFactor->solveInfo,xarray, 1532 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1533 tempGPU->data().get(), 1534 upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1535 #else 1536 tempGPU->data().get());PetscCallCUSPARSE(stat); 1537 #endif 1538 1539 /* Last, reorder with the column permutation */ 1540 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1541 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1542 xGPU); 1543 1544 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1545 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1546 PetscCall(PetscLogGpuTimeEnd()); 1547 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1548 PetscFunctionReturn(0); 1549 } 1550 1551 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1552 { 1553 const PetscScalar *barray; 1554 PetscScalar *xarray; 1555 cusparseStatus_t stat; 1556 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1557 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1558 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1559 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1560 1561 PetscFunctionBegin; 1562 /* Get the GPU pointers */ 1563 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1564 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1565 1566 PetscCall(PetscLogGpuTimeBegin()); 1567 /* First, solve L */ 1568 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1569 loTriFactor->csrMat->num_rows, 1570 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1571 loTriFactor->csrMat->num_entries, 1572 #endif 1573 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1574 loTriFactor->csrMat->values->data().get(), 1575 loTriFactor->csrMat->row_offsets->data().get(), 1576 loTriFactor->csrMat->column_indices->data().get(), 1577 loTriFactor->solveInfo, 1578 barray, 1579 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1580 tempGPU->data().get(), 1581 loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1582 #else 1583 tempGPU->data().get());PetscCallCUSPARSE(stat); 1584 #endif 1585 1586 /* Next, solve U */ 1587 stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1588 upTriFactor->csrMat->num_rows, 1589 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1590 upTriFactor->csrMat->num_entries, 1591 #endif 1592 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1593 upTriFactor->csrMat->values->data().get(), 1594 upTriFactor->csrMat->row_offsets->data().get(), 1595 upTriFactor->csrMat->column_indices->data().get(), 1596 upTriFactor->solveInfo, 1597 tempGPU->data().get(), 1598 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1599 xarray, 1600 upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1601 #else 1602 xarray);PetscCallCUSPARSE(stat); 1603 #endif 1604 1605 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1606 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1607 PetscCall(PetscLogGpuTimeEnd()); 1608 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1609 PetscFunctionReturn(0); 1610 } 1611 1612 #if CUSPARSE_VERSION >= 11500 1613 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */ 1614 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x) 1615 { 1616 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1617 Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 1618 const PetscScalar *barray; 1619 PetscScalar *xarray; 1620 1621 PetscFunctionBegin; 1622 PetscCall(VecCUDAGetArrayWrite(x,&xarray)); 1623 PetscCall(VecCUDAGetArrayRead(b,&barray)); 1624 PetscCall(PetscLogGpuTimeBegin()); 1625 1626 /* Solve L*y = b */ 1627 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); 1628 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y)); 1629 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1630 CUSPARSE_OPERATION_NON_TRANSPOSE, 1631 &PETSC_CUSPARSE_ONE, 1632 fs->spMatDescr_L, /* L Y = X */ 1633 fs->dnVecDescr_X, 1634 fs->dnVecDescr_Y, 1635 cusparse_scalartype, 1636 CUSPARSE_SPSV_ALG_DEFAULT, 1637 fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()! 1638 1639 /* Solve U*x = y */ 1640 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray)); 1641 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1642 CUSPARSE_OPERATION_NON_TRANSPOSE, 1643 &PETSC_CUSPARSE_ONE, 1644 fs->spMatDescr_U, /* U X = Y */ 1645 fs->dnVecDescr_Y, 1646 fs->dnVecDescr_X, 1647 cusparse_scalartype, 1648 CUSPARSE_SPSV_ALG_DEFAULT, 1649 fs->spsvDescr_U)); 1650 1651 PetscCall(VecCUDARestoreArrayRead(b,&barray)); 1652 PetscCall(VecCUDARestoreArrayWrite(x,&xarray)); 1653 1654 PetscCall(PetscLogGpuTimeEnd()); 1655 PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); 1656 PetscFunctionReturn(0); 1657 } 1658 1659 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x) 1660 { 1661 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1662 Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 1663 const PetscScalar *barray; 1664 PetscScalar *xarray; 1665 1666 PetscFunctionBegin; 1667 if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */ 1668 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1669 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1670 CUSPARSE_OPERATION_TRANSPOSE, 1671 &PETSC_CUSPARSE_ONE, 1672 fs->spMatDescr_L, /* The matrix is still L. We only do tranpose solve with it */ 1673 fs->dnVecDescr_X, 1674 fs->dnVecDescr_Y, 1675 cusparse_scalartype, 1676 CUSPARSE_SPSV_ALG_DEFAULT, 1677 fs->spsvDescr_Lt, 1678 &fs->spsvBufferSize_Lt)); 1679 1680 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1681 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1682 CUSPARSE_OPERATION_TRANSPOSE, 1683 &PETSC_CUSPARSE_ONE, 1684 fs->spMatDescr_U, 1685 fs->dnVecDescr_X, 1686 fs->dnVecDescr_Y, 1687 cusparse_scalartype, 1688 CUSPARSE_SPSV_ALG_DEFAULT, 1689 fs->spsvDescr_Ut, 1690 &fs->spsvBufferSize_Ut)); 1691 PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt)); 1692 PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Ut,fs->spsvBufferSize_Ut)); 1693 fs->createdTransposeSpSVDescr = PETSC_TRUE; 1694 } 1695 1696 if (!fs->updatedTransposeSpSVAnalysis) { 1697 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1698 CUSPARSE_OPERATION_TRANSPOSE, 1699 &PETSC_CUSPARSE_ONE, 1700 fs->spMatDescr_L, 1701 fs->dnVecDescr_X, 1702 fs->dnVecDescr_Y, 1703 cusparse_scalartype, 1704 CUSPARSE_SPSV_ALG_DEFAULT, 1705 fs->spsvDescr_Lt, 1706 fs->spsvBuffer_Lt)); 1707 1708 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1709 CUSPARSE_OPERATION_TRANSPOSE, 1710 &PETSC_CUSPARSE_ONE, 1711 fs->spMatDescr_U, 1712 fs->dnVecDescr_X, 1713 fs->dnVecDescr_Y, 1714 cusparse_scalartype, 1715 CUSPARSE_SPSV_ALG_DEFAULT, 1716 fs->spsvDescr_Ut, 1717 fs->spsvBuffer_Ut)); 1718 fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1719 } 1720 1721 PetscCall(VecCUDAGetArrayWrite(x,&xarray)); 1722 PetscCall(VecCUDAGetArrayRead(b,&barray)); 1723 PetscCall(PetscLogGpuTimeBegin()); 1724 1725 /* Solve Ut*y = b */ 1726 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); 1727 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y)); 1728 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1729 CUSPARSE_OPERATION_TRANSPOSE, 1730 &PETSC_CUSPARSE_ONE, 1731 fs->spMatDescr_U, /* Ut Y = X */ 1732 fs->dnVecDescr_X, 1733 fs->dnVecDescr_Y, 1734 cusparse_scalartype, 1735 CUSPARSE_SPSV_ALG_DEFAULT, 1736 fs->spsvDescr_Ut)); 1737 1738 /* Solve Lt*x = y */ 1739 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray)); 1740 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1741 CUSPARSE_OPERATION_TRANSPOSE, 1742 &PETSC_CUSPARSE_ONE, 1743 fs->spMatDescr_L, /* Lt X = Y */ 1744 fs->dnVecDescr_Y, 1745 fs->dnVecDescr_X, 1746 cusparse_scalartype, 1747 CUSPARSE_SPSV_ALG_DEFAULT, 1748 fs->spsvDescr_Lt)); 1749 1750 PetscCall(VecCUDARestoreArrayRead(b,&barray)); 1751 PetscCall(VecCUDARestoreArrayWrite(x,&xarray)); 1752 PetscCall(PetscLogGpuTimeEnd()); 1753 PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); 1754 PetscFunctionReturn(0); 1755 } 1756 1757 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info) 1758 { 1759 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1760 Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 1761 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1762 CsrMatrix *Acsr; 1763 PetscInt m,nz; 1764 PetscBool flg; 1765 1766 PetscFunctionBegin; 1767 if (PetscDefined(USE_DEBUG)) { 1768 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 1769 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 1770 } 1771 1772 /* Copy A's value to fact */ 1773 m = fact->rmap->n; 1774 nz = aij->nz; 1775 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1776 Acsr = (CsrMatrix*)Acusp->mat->mat; 1777 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 1778 1779 /* Factorize fact inplace */ 1780 if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1781 fs->matDescr_M, 1782 fs->csrVal, 1783 fs->csrRowPtr, 1784 fs->csrColIdx, 1785 fs->ilu0Info_M, 1786 fs->policy_M, 1787 fs->factBuffer_M)); 1788 if (PetscDefined(USE_DEBUG)) { 1789 int numerical_zero; 1790 cusparseStatus_t status; 1791 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1792 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero); 1793 } 1794 1795 /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 1796 See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78 1797 */ 1798 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1799 CUSPARSE_OPERATION_NON_TRANSPOSE, 1800 &PETSC_CUSPARSE_ONE, 1801 fs->spMatDescr_L, 1802 fs->dnVecDescr_X, 1803 fs->dnVecDescr_Y, 1804 cusparse_scalartype, 1805 CUSPARSE_SPSV_ALG_DEFAULT, 1806 fs->spsvDescr_L, 1807 fs->spsvBuffer_L)); 1808 1809 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1810 CUSPARSE_OPERATION_NON_TRANSPOSE, 1811 &PETSC_CUSPARSE_ONE, 1812 fs->spMatDescr_U, 1813 fs->dnVecDescr_X, 1814 fs->dnVecDescr_Y, 1815 cusparse_scalartype, 1816 CUSPARSE_SPSV_ALG_DEFAULT, 1817 fs->spsvDescr_U, 1818 fs->spsvBuffer_U)); 1819 1820 /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 1821 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 1822 1823 fact->offloadmask = PETSC_OFFLOAD_GPU; 1824 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ILU0; 1825 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_ILU0; 1826 fact->ops->matsolve = NULL; 1827 fact->ops->matsolvetranspose = NULL; 1828 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1829 PetscFunctionReturn(0); 1830 } 1831 1832 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1833 { 1834 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1835 Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 1836 PetscInt m,nz; 1837 1838 PetscFunctionBegin; 1839 if (PetscDefined(USE_DEBUG)) { 1840 PetscInt i; 1841 PetscBool flg,missing; 1842 1843 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 1844 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 1845 PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n); 1846 PetscCall(MatMissingDiagonal(A,&missing,&i)); 1847 PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i); 1848 } 1849 1850 /* Free the old stale stuff */ 1851 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1852 1853 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1854 but they will not be used. Allocate them just for easy debugging. 1855 */ 1856 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/)); 1857 1858 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1859 fact->factortype = MAT_FACTOR_ILU; 1860 fact->info.factor_mallocs = 0; 1861 fact->info.fill_ratio_given = info->fill; 1862 fact->info.fill_ratio_needed = 1.0; 1863 1864 aij->row = NULL; 1865 aij->col = NULL; 1866 1867 /* ====================================================================== */ 1868 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1869 /* We'll do in-place factorization on fact */ 1870 /* ====================================================================== */ 1871 const int *Ai,*Aj; 1872 1873 m = fact->rmap->n; 1874 nz = aij->nz; 1875 1876 PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1))); 1877 PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz)); 1878 PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz)); 1879 PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */ 1880 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 1881 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 1882 1883 /* ====================================================================== */ 1884 /* Create descriptors for M, L, U */ 1885 /* ====================================================================== */ 1886 cusparseFillMode_t fillMode; 1887 cusparseDiagType_t diagType; 1888 1889 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1890 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1891 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1892 1893 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1894 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1895 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1896 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1897 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 1898 */ 1899 fillMode = CUSPARSE_FILL_MODE_LOWER; 1900 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1901 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz, 1902 fs->csrRowPtr, 1903 fs->csrColIdx, 1904 fs->csrVal, 1905 CUSPARSE_INDEX_32I, 1906 CUSPARSE_INDEX_32I, 1907 CUSPARSE_INDEX_BASE_ZERO, 1908 cusparse_scalartype)); 1909 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 1910 CUSPARSE_SPMAT_FILL_MODE, 1911 &fillMode, 1912 sizeof(fillMode))); 1913 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 1914 CUSPARSE_SPMAT_DIAG_TYPE, 1915 &diagType, 1916 sizeof(diagType))); 1917 1918 fillMode = CUSPARSE_FILL_MODE_UPPER; 1919 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1920 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz, 1921 fs->csrRowPtr, 1922 fs->csrColIdx, 1923 fs->csrVal, 1924 CUSPARSE_INDEX_32I, 1925 CUSPARSE_INDEX_32I, 1926 CUSPARSE_INDEX_BASE_ZERO, 1927 cusparse_scalartype)); 1928 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, 1929 CUSPARSE_SPMAT_FILL_MODE, 1930 &fillMode, 1931 sizeof(fillMode))); 1932 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, 1933 CUSPARSE_SPMAT_DIAG_TYPE, 1934 &diagType, 1935 sizeof(diagType))); 1936 1937 /* ========================================================================= */ 1938 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1939 /* ========================================================================= */ 1940 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1941 if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1942 fs->matDescr_M, 1943 fs->csrVal, 1944 fs->csrRowPtr, 1945 fs->csrColIdx, 1946 fs->ilu0Info_M, 1947 &fs->factBufferSize_M)); 1948 1949 PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m)); 1950 PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m)); 1951 1952 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype)); 1953 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype)); 1954 1955 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1956 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1957 CUSPARSE_OPERATION_NON_TRANSPOSE, 1958 &PETSC_CUSPARSE_ONE, 1959 fs->spMatDescr_L, 1960 fs->dnVecDescr_X, 1961 fs->dnVecDescr_Y, 1962 cusparse_scalartype, 1963 CUSPARSE_SPSV_ALG_DEFAULT, 1964 fs->spsvDescr_L, 1965 &fs->spsvBufferSize_L)); 1966 1967 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1968 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1969 CUSPARSE_OPERATION_NON_TRANSPOSE, 1970 &PETSC_CUSPARSE_ONE, 1971 fs->spMatDescr_U, 1972 fs->dnVecDescr_X, 1973 fs->dnVecDescr_Y, 1974 cusparse_scalartype, 1975 CUSPARSE_SPSV_ALG_DEFAULT, 1976 fs->spsvDescr_U, 1977 &fs->spsvBufferSize_U)); 1978 1979 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1980 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1981 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1982 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 1983 */ 1984 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1985 PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M))); 1986 fs->spsvBuffer_L = fs->factBuffer_M; 1987 PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U)); 1988 } else { 1989 PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_U,(size_t)fs->factBufferSize_M))); 1990 fs->spsvBuffer_U = fs->factBuffer_M; 1991 PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L)); 1992 } 1993 1994 /* ========================================================================== */ 1995 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1996 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1997 /* ========================================================================== */ 1998 int structural_zero; 1999 cusparseStatus_t status; 2000 2001 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2002 if (m) PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 2003 fs->matDescr_M, 2004 fs->csrVal, 2005 fs->csrRowPtr, 2006 fs->csrColIdx, 2007 fs->ilu0Info_M, 2008 fs->policy_M, 2009 fs->factBuffer_M)); 2010 if (PetscDefined(USE_DEBUG)) { 2011 /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 2012 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 2013 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero); 2014 } 2015 2016 /* Estimate FLOPs of the numeric factorization */ 2017 { 2018 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ*)A->data; 2019 PetscInt *Ai,*Adiag,nzRow,nzLeft; 2020 PetscLogDouble flops = 0.0; 2021 2022 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 2023 Ai = Aseq->i; 2024 Adiag = Aseq->diag; 2025 for (PetscInt i=0; i<m; i++) { 2026 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i+1]) { /* There are nonzeros left to the diagonal of row i */ 2027 nzRow = Ai[i+1] - Ai[i]; 2028 nzLeft = Adiag[i] - Ai[i]; 2029 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2030 and include the eliminated one will be updated, which incurs a multiplication and an addition. 2031 */ 2032 nzLeft = (nzRow-1)/2; 2033 flops += nzLeft*(2.0*nzRow-nzLeft+1); 2034 } 2035 } 2036 fs->numericFactFlops = flops; 2037 } 2038 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 2039 PetscFunctionReturn(0); 2040 } 2041 2042 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x) 2043 { 2044 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 2045 Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 2046 const PetscScalar *barray; 2047 PetscScalar *xarray; 2048 2049 PetscFunctionBegin; 2050 PetscCall(VecCUDAGetArrayWrite(x,&xarray)); 2051 PetscCall(VecCUDAGetArrayRead(b,&barray)); 2052 PetscCall(PetscLogGpuTimeBegin()); 2053 2054 /* Solve L*y = b */ 2055 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); 2056 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y)); 2057 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 2058 CUSPARSE_OPERATION_NON_TRANSPOSE, 2059 &PETSC_CUSPARSE_ONE, 2060 fs->spMatDescr_L, /* L Y = X */ 2061 fs->dnVecDescr_X, 2062 fs->dnVecDescr_Y, 2063 cusparse_scalartype, 2064 CUSPARSE_SPSV_ALG_DEFAULT, 2065 fs->spsvDescr_L)); 2066 2067 /* Solve Lt*x = y */ 2068 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray)); 2069 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 2070 CUSPARSE_OPERATION_TRANSPOSE, 2071 &PETSC_CUSPARSE_ONE, 2072 fs->spMatDescr_L, /* Lt X = Y */ 2073 fs->dnVecDescr_Y, 2074 fs->dnVecDescr_X, 2075 cusparse_scalartype, 2076 CUSPARSE_SPSV_ALG_DEFAULT, 2077 fs->spsvDescr_Lt)); 2078 2079 PetscCall(VecCUDARestoreArrayRead(b,&barray)); 2080 PetscCall(VecCUDARestoreArrayWrite(x,&xarray)); 2081 2082 PetscCall(PetscLogGpuTimeEnd()); 2083 PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); 2084 PetscFunctionReturn(0); 2085 } 2086 2087 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *info) 2088 { 2089 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 2090 Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 2091 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2092 CsrMatrix *Acsr; 2093 PetscInt m,nz; 2094 PetscBool flg; 2095 2096 PetscFunctionBegin; 2097 if (PetscDefined(USE_DEBUG)) { 2098 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2099 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 2100 } 2101 2102 /* Copy A's value to fact */ 2103 m = fact->rmap->n; 2104 nz = aij->nz; 2105 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2106 Acsr = (CsrMatrix*)Acusp->mat->mat; 2107 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 2108 2109 /* Factorize fact inplace */ 2110 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 2111 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 2112 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 2113 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 2114 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 2115 */ 2116 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, 2117 fs->matDescr_M, 2118 fs->csrVal, 2119 fs->csrRowPtr, 2120 fs->csrColIdx, 2121 fs->ic0Info_M, 2122 fs->policy_M, 2123 fs->factBuffer_M)); 2124 if (PetscDefined(USE_DEBUG)) { 2125 int numerical_zero; 2126 cusparseStatus_t status; 2127 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 2128 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",numerical_zero,numerical_zero); 2129 } 2130 2131 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 2132 CUSPARSE_OPERATION_NON_TRANSPOSE, 2133 &PETSC_CUSPARSE_ONE, 2134 fs->spMatDescr_L, 2135 fs->dnVecDescr_X, 2136 fs->dnVecDescr_Y, 2137 cusparse_scalartype, 2138 CUSPARSE_SPSV_ALG_DEFAULT, 2139 fs->spsvDescr_L, 2140 fs->spsvBuffer_L)); 2141 2142 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 2143 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 2144 */ 2145 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 2146 CUSPARSE_OPERATION_TRANSPOSE, 2147 &PETSC_CUSPARSE_ONE, 2148 fs->spMatDescr_L, 2149 fs->dnVecDescr_X, 2150 fs->dnVecDescr_Y, 2151 cusparse_scalartype, 2152 CUSPARSE_SPSV_ALG_DEFAULT, 2153 fs->spsvDescr_Lt, 2154 fs->spsvBuffer_Lt)); 2155 2156 fact->offloadmask = PETSC_OFFLOAD_GPU; 2157 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 2158 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 2159 fact->ops->matsolve = NULL; 2160 fact->ops->matsolvetranspose = NULL; 2161 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 2162 PetscFunctionReturn(0); 2163 } 2164 2165 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2166 { 2167 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 2168 Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 2169 PetscInt m,nz; 2170 2171 PetscFunctionBegin; 2172 if (PetscDefined(USE_DEBUG)) { 2173 PetscInt i; 2174 PetscBool flg,missing; 2175 2176 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2177 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 2178 PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n); 2179 PetscCall(MatMissingDiagonal(A,&missing,&i)); 2180 PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i); 2181 } 2182 2183 /* Free the old stale stuff */ 2184 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2185 2186 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2187 but they will not be used. Allocate them just for easy debugging. 2188 */ 2189 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/)); 2190 2191 fact->offloadmask = PETSC_OFFLOAD_BOTH; 2192 fact->factortype = MAT_FACTOR_ICC; 2193 fact->info.factor_mallocs = 0; 2194 fact->info.fill_ratio_given = info->fill; 2195 fact->info.fill_ratio_needed = 1.0; 2196 2197 aij->row = NULL; 2198 aij->col = NULL; 2199 2200 /* ====================================================================== */ 2201 /* Copy A's i, j to fact and also allocate the value array of fact. */ 2202 /* We'll do in-place factorization on fact */ 2203 /* ====================================================================== */ 2204 const int *Ai,*Aj; 2205 2206 m = fact->rmap->n; 2207 nz = aij->nz; 2208 2209 PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1))); 2210 PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz)); 2211 PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz)); 2212 PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */ 2213 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 2214 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 2215 2216 /* ====================================================================== */ 2217 /* Create mat descriptors for M, L */ 2218 /* ====================================================================== */ 2219 cusparseFillMode_t fillMode; 2220 cusparseDiagType_t diagType; 2221 2222 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2223 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2224 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2225 2226 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2227 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2228 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2229 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2230 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 2231 */ 2232 fillMode = CUSPARSE_FILL_MODE_LOWER; 2233 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2234 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz, 2235 fs->csrRowPtr, 2236 fs->csrColIdx, 2237 fs->csrVal, 2238 CUSPARSE_INDEX_32I, 2239 CUSPARSE_INDEX_32I, 2240 CUSPARSE_INDEX_BASE_ZERO, 2241 cusparse_scalartype)); 2242 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 2243 CUSPARSE_SPMAT_FILL_MODE, 2244 &fillMode, 2245 sizeof(fillMode))); 2246 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 2247 CUSPARSE_SPMAT_DIAG_TYPE, 2248 &diagType, 2249 sizeof(diagType))); 2250 2251 /* ========================================================================= */ 2252 /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2253 /* ========================================================================= */ 2254 PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2255 if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, 2256 fs->matDescr_M, 2257 fs->csrVal, 2258 fs->csrRowPtr, 2259 fs->csrColIdx, 2260 fs->ic0Info_M, 2261 &fs->factBufferSize_M)); 2262 2263 PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m)); 2264 PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m)); 2265 2266 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype)); 2267 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype)); 2268 2269 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 2270 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 2271 CUSPARSE_OPERATION_NON_TRANSPOSE, 2272 &PETSC_CUSPARSE_ONE, 2273 fs->spMatDescr_L, 2274 fs->dnVecDescr_X, 2275 fs->dnVecDescr_Y, 2276 cusparse_scalartype, 2277 CUSPARSE_SPSV_ALG_DEFAULT, 2278 fs->spsvDescr_L, 2279 &fs->spsvBufferSize_L)); 2280 2281 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 2282 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 2283 CUSPARSE_OPERATION_TRANSPOSE, 2284 &PETSC_CUSPARSE_ONE, 2285 fs->spMatDescr_L, 2286 fs->dnVecDescr_X, 2287 fs->dnVecDescr_Y, 2288 cusparse_scalartype, 2289 CUSPARSE_SPSV_ALG_DEFAULT, 2290 fs->spsvDescr_Lt, 2291 &fs->spsvBufferSize_Lt)); 2292 2293 /* To save device memory, we make the factorization buffer share with one of the solver buffer. 2294 See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 2295 */ 2296 if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 2297 PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M))); 2298 fs->spsvBuffer_L = fs->factBuffer_M; 2299 PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt)); 2300 } else { 2301 PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_Lt,(size_t)fs->factBufferSize_M))); 2302 fs->spsvBuffer_Lt = fs->factBuffer_M; 2303 PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L)); 2304 } 2305 2306 /* ========================================================================== */ 2307 /* Perform analysis of ic0 on M */ 2308 /* The lower triangular part of M has the same sparsity pattern as L */ 2309 /* ========================================================================== */ 2310 int structural_zero; 2311 cusparseStatus_t status; 2312 2313 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2314 if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, 2315 fs->matDescr_M, 2316 fs->csrVal, 2317 fs->csrRowPtr, 2318 fs->csrColIdx, 2319 fs->ic0Info_M, 2320 fs->policy_M, 2321 fs->factBuffer_M)); 2322 if (PetscDefined(USE_DEBUG)) { 2323 /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 2324 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2325 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero); 2326 } 2327 2328 /* Estimate FLOPs of the numeric factorization */ 2329 { 2330 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ*)A->data; 2331 PetscInt *Ai,nzRow,nzLeft; 2332 PetscLogDouble flops = 0.0; 2333 2334 Ai = Aseq->i; 2335 for (PetscInt i=0; i<m; i++) { 2336 nzRow = Ai[i+1] - Ai[i]; 2337 if (nzRow > 1) { 2338 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2339 and include the eliminated one will be updated, which incurs a multiplication and an addition. 2340 */ 2341 nzLeft = (nzRow-1)/2; 2342 flops += nzLeft*(2.0*nzRow-nzLeft+1); 2343 } 2344 } 2345 fs->numericFactFlops = flops; 2346 } 2347 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 2348 PetscFunctionReturn(0); 2349 } 2350 #endif 2351 2352 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2353 { 2354 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2355 2356 PetscFunctionBegin; 2357 #if CUSPARSE_VERSION >= 11500 2358 PetscBool row_identity = PETSC_FALSE,col_identity = PETSC_FALSE; 2359 if (cusparseTriFactors->factorizeOnDevice) { 2360 PetscCall(ISIdentity(isrow,&row_identity)); 2361 PetscCall(ISIdentity(iscol,&col_identity)); 2362 } 2363 if (!info->levels && row_identity && col_identity) { 2364 PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info)); 2365 } else 2366 #endif 2367 { 2368 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2369 PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2370 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2371 } 2372 PetscFunctionReturn(0); 2373 } 2374 2375 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2376 { 2377 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2378 2379 PetscFunctionBegin; 2380 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2381 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2382 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2383 PetscFunctionReturn(0); 2384 } 2385 2386 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2387 { 2388 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2389 2390 PetscFunctionBegin; 2391 #if CUSPARSE_VERSION >= 11500 2392 PetscBool perm_identity = PETSC_FALSE; 2393 if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm,&perm_identity)); 2394 if (!info->levels && perm_identity) { 2395 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info)); 2396 } else 2397 #endif 2398 { 2399 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2400 PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info)); 2401 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2402 } 2403 PetscFunctionReturn(0); 2404 } 2405 2406 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2407 { 2408 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2409 2410 PetscFunctionBegin; 2411 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2412 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info)); 2413 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2414 PetscFunctionReturn(0); 2415 } 2416 2417 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 2418 { 2419 PetscFunctionBegin; 2420 *type = MATSOLVERCUSPARSE; 2421 PetscFunctionReturn(0); 2422 } 2423 2424 /*MC 2425 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 2426 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 2427 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2428 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 2429 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2430 algorithms are not recommended. This class does NOT support direct solver operations. 2431 2432 Level: beginner 2433 2434 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2435 M*/ 2436 2437 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 2438 { 2439 PetscInt n = A->rmap->n; 2440 PetscBool factOnDevice,factOnHost; 2441 char *prefix; 2442 char factPlace[32] = "device"; /* the default */ 2443 2444 PetscFunctionBegin; 2445 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 2446 PetscCall(MatSetSizes(*B,n,n,n,n)); 2447 (*B)->factortype = ftype; 2448 PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE)); 2449 2450 prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix; 2451 PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)),prefix,"MatGetFactor","Mat"); 2452 PetscCall(PetscOptionsString("-mat_factor_bind_factorization","Do matrix factorization on host or device when possible","MatGetFactor",NULL,factPlace,sizeof(factPlace),NULL)); 2453 PetscOptionsEnd(); 2454 PetscCall(PetscStrcasecmp("device",factPlace,&factOnDevice)); 2455 PetscCall(PetscStrcasecmp("host",factPlace,&factOnHost)); 2456 PetscCheck(factOnDevice || factOnHost,PetscObjectComm((PetscObject)(*B)),PETSC_ERR_ARG_OUTOFRANGE,"Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed",factPlace); 2457 ((Mat_SeqAIJCUSPARSETriFactors*)(*B)->spptr)->factorizeOnDevice = factOnDevice; 2458 2459 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE)); 2460 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2461 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 2462 if (!A->boundtocpu) { 2463 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2464 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2465 } else { 2466 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2467 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2468 } 2469 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 2470 PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2471 PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2472 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2473 if (!A->boundtocpu) { 2474 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2475 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2476 } else { 2477 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2478 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2479 } 2480 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2481 PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2482 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 2483 2484 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 2485 (*B)->canuseordering = PETSC_TRUE; 2486 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse)); 2487 PetscFunctionReturn(0); 2488 } 2489 2490 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 2491 { 2492 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2493 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2494 #if CUSPARSE_VERSION >= 13500 2495 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 2496 #endif 2497 2498 PetscFunctionBegin; 2499 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 2500 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 2501 if (A->factortype == MAT_FACTOR_NONE) { 2502 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 2503 PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2504 } 2505 #if CUSPARSE_VERSION >= 13500 2506 else if (fs->csrVal) { 2507 /* We have a factorized matrix on device and are able to copy it to host */ 2508 PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2509 } 2510 #endif 2511 else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host"); 2512 PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 2513 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 2514 A->offloadmask = PETSC_OFFLOAD_BOTH; 2515 } 2516 PetscFunctionReturn(0); 2517 } 2518 2519 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2520 { 2521 PetscFunctionBegin; 2522 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2523 *array = ((Mat_SeqAIJ*)A->data)->a; 2524 PetscFunctionReturn(0); 2525 } 2526 2527 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2528 { 2529 PetscFunctionBegin; 2530 A->offloadmask = PETSC_OFFLOAD_CPU; 2531 *array = NULL; 2532 PetscFunctionReturn(0); 2533 } 2534 2535 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 2536 { 2537 PetscFunctionBegin; 2538 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2539 *array = ((Mat_SeqAIJ*)A->data)->a; 2540 PetscFunctionReturn(0); 2541 } 2542 2543 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 2544 { 2545 PetscFunctionBegin; 2546 *array = NULL; 2547 PetscFunctionReturn(0); 2548 } 2549 2550 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2551 { 2552 PetscFunctionBegin; 2553 *array = ((Mat_SeqAIJ*)A->data)->a; 2554 PetscFunctionReturn(0); 2555 } 2556 2557 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 2558 { 2559 PetscFunctionBegin; 2560 A->offloadmask = PETSC_OFFLOAD_CPU; 2561 *array = NULL; 2562 PetscFunctionReturn(0); 2563 } 2564 2565 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 2566 { 2567 Mat_SeqAIJCUSPARSE *cusp; 2568 CsrMatrix *matrix; 2569 2570 PetscFunctionBegin; 2571 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2572 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2573 cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr); 2574 PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL"); 2575 matrix = (CsrMatrix*)cusp->mat->mat; 2576 2577 if (i) { 2578 #if !defined(PETSC_USE_64BIT_INDICES) 2579 *i = matrix->row_offsets->data().get(); 2580 #else 2581 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 2582 #endif 2583 } 2584 if (j) { 2585 #if !defined(PETSC_USE_64BIT_INDICES) 2586 *j = matrix->column_indices->data().get(); 2587 #else 2588 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 2589 #endif 2590 } 2591 if (a) *a = matrix->values->data().get(); 2592 if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 2593 PetscFunctionReturn(0); 2594 } 2595 2596 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 2597 { 2598 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2599 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 2600 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2601 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 2602 cusparseStatus_t stat; 2603 PetscBool both = PETSC_TRUE; 2604 2605 PetscFunctionBegin; 2606 PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 2607 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2608 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2609 CsrMatrix *matrix; 2610 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 2611 2612 PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 2613 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 2614 matrix->values->assign(a->a, a->a+a->nz); 2615 PetscCallCUDA(WaitForCUDA()); 2616 PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar))); 2617 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 2618 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 2619 } else { 2620 PetscInt nnz; 2621 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 2622 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format)); 2623 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 2624 delete cusparsestruct->workVector; 2625 delete cusparsestruct->rowoffsets_gpu; 2626 cusparsestruct->workVector = NULL; 2627 cusparsestruct->rowoffsets_gpu = NULL; 2628 try { 2629 if (a->compressedrow.use) { 2630 m = a->compressedrow.nrows; 2631 ii = a->compressedrow.i; 2632 ridx = a->compressedrow.rindex; 2633 } else { 2634 m = A->rmap->n; 2635 ii = a->i; 2636 ridx = NULL; 2637 } 2638 PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 2639 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 2640 else nnz = a->nz; 2641 PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 2642 2643 /* create cusparse matrix */ 2644 cusparsestruct->nrows = m; 2645 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 2646 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 2647 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 2648 PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2649 2650 PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar))); 2651 PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar))); 2652 PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 2653 PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2654 PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2655 PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2656 PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2657 2658 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2659 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 2660 /* set the matrix */ 2661 CsrMatrix *mat= new CsrMatrix; 2662 mat->num_rows = m; 2663 mat->num_cols = A->cmap->n; 2664 mat->num_entries = nnz; 2665 mat->row_offsets = new THRUSTINTARRAY32(m+1); 2666 mat->row_offsets->assign(ii, ii + m+1); 2667 2668 mat->column_indices = new THRUSTINTARRAY32(nnz); 2669 mat->column_indices->assign(a->j, a->j+nnz); 2670 2671 mat->values = new THRUSTARRAY(nnz); 2672 if (a->a) mat->values->assign(a->a, a->a+nnz); 2673 2674 /* assign the pointer */ 2675 matstruct->mat = mat; 2676 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2677 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 2678 stat = cusparseCreateCsr(&matstruct->matDescr, 2679 mat->num_rows, mat->num_cols, mat->num_entries, 2680 mat->row_offsets->data().get(), mat->column_indices->data().get(), 2681 mat->values->data().get(), 2682 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2683 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2684 } 2685 #endif 2686 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 2687 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2688 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2689 #else 2690 CsrMatrix *mat= new CsrMatrix; 2691 mat->num_rows = m; 2692 mat->num_cols = A->cmap->n; 2693 mat->num_entries = nnz; 2694 mat->row_offsets = new THRUSTINTARRAY32(m+1); 2695 mat->row_offsets->assign(ii, ii + m+1); 2696 2697 mat->column_indices = new THRUSTINTARRAY32(nnz); 2698 mat->column_indices->assign(a->j, a->j+nnz); 2699 2700 mat->values = new THRUSTARRAY(nnz); 2701 if (a->a) mat->values->assign(a->a, a->a+nnz); 2702 2703 cusparseHybMat_t hybMat; 2704 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 2705 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 2706 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2707 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 2708 matstruct->descr, mat->values->data().get(), 2709 mat->row_offsets->data().get(), 2710 mat->column_indices->data().get(), 2711 hybMat, 0, partition);PetscCallCUSPARSE(stat); 2712 /* assign the pointer */ 2713 matstruct->mat = hybMat; 2714 2715 if (mat) { 2716 if (mat->values) delete (THRUSTARRAY*)mat->values; 2717 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 2718 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 2719 delete (CsrMatrix*)mat; 2720 } 2721 #endif 2722 } 2723 2724 /* assign the compressed row indices */ 2725 if (a->compressedrow.use) { 2726 cusparsestruct->workVector = new THRUSTARRAY(m); 2727 matstruct->cprowIndices = new THRUSTINTARRAY(m); 2728 matstruct->cprowIndices->assign(ridx,ridx+m); 2729 tmp = m; 2730 } else { 2731 cusparsestruct->workVector = NULL; 2732 matstruct->cprowIndices = NULL; 2733 tmp = 0; 2734 } 2735 PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar))); 2736 2737 /* assign the pointer */ 2738 cusparsestruct->mat = matstruct; 2739 } catch(char *ex) { 2740 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2741 } 2742 PetscCallCUDA(WaitForCUDA()); 2743 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 2744 cusparsestruct->nonzerostate = A->nonzerostate; 2745 } 2746 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2747 } 2748 PetscFunctionReturn(0); 2749 } 2750 2751 struct VecCUDAPlusEquals 2752 { 2753 template <typename Tuple> 2754 __host__ __device__ 2755 void operator()(Tuple t) 2756 { 2757 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2758 } 2759 }; 2760 2761 struct VecCUDAEquals 2762 { 2763 template <typename Tuple> 2764 __host__ __device__ 2765 void operator()(Tuple t) 2766 { 2767 thrust::get<1>(t) = thrust::get<0>(t); 2768 } 2769 }; 2770 2771 struct VecCUDAEqualsReverse 2772 { 2773 template <typename Tuple> 2774 __host__ __device__ 2775 void operator()(Tuple t) 2776 { 2777 thrust::get<0>(t) = thrust::get<1>(t); 2778 } 2779 }; 2780 2781 struct MatMatCusparse { 2782 PetscBool cisdense; 2783 PetscScalar *Bt; 2784 Mat X; 2785 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2786 PetscLogDouble flops; 2787 CsrMatrix *Bcsr; 2788 2789 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2790 cusparseSpMatDescr_t matSpBDescr; 2791 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2792 cusparseDnMatDescr_t matBDescr; 2793 cusparseDnMatDescr_t matCDescr; 2794 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2795 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2796 void *dBuffer4; 2797 void *dBuffer5; 2798 #endif 2799 size_t mmBufferSize; 2800 void *mmBuffer; 2801 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2802 cusparseSpGEMMDescr_t spgemmDesc; 2803 #endif 2804 }; 2805 2806 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2807 { 2808 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2809 2810 PetscFunctionBegin; 2811 PetscCallCUDA(cudaFree(mmdata->Bt)); 2812 delete mmdata->Bcsr; 2813 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2814 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2815 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2816 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2817 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2818 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2819 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2820 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2821 #endif 2822 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2823 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2824 #endif 2825 PetscCall(MatDestroy(&mmdata->X)); 2826 PetscCall(PetscFree(data)); 2827 PetscFunctionReturn(0); 2828 } 2829 2830 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2831 2832 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2833 { 2834 Mat_Product *product = C->product; 2835 Mat A,B; 2836 PetscInt m,n,blda,clda; 2837 PetscBool flg,biscuda; 2838 Mat_SeqAIJCUSPARSE *cusp; 2839 cusparseStatus_t stat; 2840 cusparseOperation_t opA; 2841 const PetscScalar *barray; 2842 PetscScalar *carray; 2843 MatMatCusparse *mmdata; 2844 Mat_SeqAIJCUSPARSEMultStruct *mat; 2845 CsrMatrix *csrmat; 2846 2847 PetscFunctionBegin; 2848 MatCheckProduct(C,1); 2849 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2850 mmdata = (MatMatCusparse*)product->data; 2851 A = product->A; 2852 B = product->B; 2853 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2854 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2855 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2856 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2857 PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2858 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2859 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2860 switch (product->type) { 2861 case MATPRODUCT_AB: 2862 case MATPRODUCT_PtAP: 2863 mat = cusp->mat; 2864 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2865 m = A->rmap->n; 2866 n = B->cmap->n; 2867 break; 2868 case MATPRODUCT_AtB: 2869 if (!A->form_explicit_transpose) { 2870 mat = cusp->mat; 2871 opA = CUSPARSE_OPERATION_TRANSPOSE; 2872 } else { 2873 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2874 mat = cusp->matTranspose; 2875 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2876 } 2877 m = A->cmap->n; 2878 n = B->cmap->n; 2879 break; 2880 case MATPRODUCT_ABt: 2881 case MATPRODUCT_RARt: 2882 mat = cusp->mat; 2883 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2884 m = A->rmap->n; 2885 n = B->rmap->n; 2886 break; 2887 default: 2888 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2889 } 2890 PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2891 csrmat = (CsrMatrix*)mat->mat; 2892 /* if the user passed a CPU matrix, copy the data to the GPU */ 2893 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda)); 2894 if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B)); 2895 PetscCall(MatDenseCUDAGetArrayRead(B,&barray)); 2896 2897 PetscCall(MatDenseGetLDA(B,&blda)); 2898 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2899 PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray)); 2900 PetscCall(MatDenseGetLDA(mmdata->X,&clda)); 2901 } else { 2902 PetscCall(MatDenseCUDAGetArrayWrite(C,&carray)); 2903 PetscCall(MatDenseGetLDA(C,&clda)); 2904 } 2905 2906 PetscCall(PetscLogGpuTimeBegin()); 2907 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2908 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2909 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2910 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2911 size_t mmBufferSize; 2912 if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2913 if (!mmdata->matBDescr) { 2914 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2915 mmdata->Blda = blda; 2916 } 2917 2918 if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2919 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2920 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2921 mmdata->Clda = clda; 2922 } 2923 2924 if (!mat->matDescr) { 2925 stat = cusparseCreateCsr(&mat->matDescr, 2926 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2927 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2928 csrmat->values->data().get(), 2929 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2930 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2931 } 2932 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2933 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2934 mmdata->matCDescr,cusparse_scalartype, 2935 cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2936 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2937 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2938 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2939 mmdata->mmBufferSize = mmBufferSize; 2940 } 2941 mmdata->initialized = PETSC_TRUE; 2942 } else { 2943 /* to be safe, always update pointers of the mats */ 2944 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 2945 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 2946 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2947 } 2948 2949 /* do cusparseSpMM, which supports transpose on B */ 2950 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2951 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2952 mmdata->matCDescr,cusparse_scalartype, 2953 cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2954 #else 2955 PetscInt k; 2956 /* cusparseXcsrmm does not support transpose on B */ 2957 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2958 cublasHandle_t cublasv2handle; 2959 cublasStatus_t cerr; 2960 2961 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2962 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2963 B->cmap->n,B->rmap->n, 2964 &PETSC_CUSPARSE_ONE ,barray,blda, 2965 &PETSC_CUSPARSE_ZERO,barray,blda, 2966 mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2967 blda = B->cmap->n; 2968 k = B->cmap->n; 2969 } else { 2970 k = B->rmap->n; 2971 } 2972 2973 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2974 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2975 csrmat->num_entries,mat->alpha_one,mat->descr, 2976 csrmat->values->data().get(), 2977 csrmat->row_offsets->data().get(), 2978 csrmat->column_indices->data().get(), 2979 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2980 carray,clda);PetscCallCUSPARSE(stat); 2981 #endif 2982 PetscCall(PetscLogGpuTimeEnd()); 2983 PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries)); 2984 PetscCall(MatDenseCUDARestoreArrayRead(B,&barray)); 2985 if (product->type == MATPRODUCT_RARt) { 2986 PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 2987 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE)); 2988 } else if (product->type == MATPRODUCT_PtAP) { 2989 PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 2990 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE)); 2991 } else { 2992 PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray)); 2993 } 2994 if (mmdata->cisdense) { 2995 PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C)); 2996 } 2997 if (!biscuda) { 2998 PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B)); 2999 } 3000 PetscFunctionReturn(0); 3001 } 3002 3003 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 3004 { 3005 Mat_Product *product = C->product; 3006 Mat A,B; 3007 PetscInt m,n; 3008 PetscBool cisdense,flg; 3009 MatMatCusparse *mmdata; 3010 Mat_SeqAIJCUSPARSE *cusp; 3011 3012 PetscFunctionBegin; 3013 MatCheckProduct(C,1); 3014 PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 3015 A = product->A; 3016 B = product->B; 3017 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 3018 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 3019 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3020 PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3021 switch (product->type) { 3022 case MATPRODUCT_AB: 3023 m = A->rmap->n; 3024 n = B->cmap->n; 3025 break; 3026 case MATPRODUCT_AtB: 3027 m = A->cmap->n; 3028 n = B->cmap->n; 3029 break; 3030 case MATPRODUCT_ABt: 3031 m = A->rmap->n; 3032 n = B->rmap->n; 3033 break; 3034 case MATPRODUCT_PtAP: 3035 m = B->cmap->n; 3036 n = B->cmap->n; 3037 break; 3038 case MATPRODUCT_RARt: 3039 m = B->rmap->n; 3040 n = B->rmap->n; 3041 break; 3042 default: 3043 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 3044 } 3045 PetscCall(MatSetSizes(C,m,n,m,n)); 3046 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 3047 PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense)); 3048 PetscCall(MatSetType(C,MATSEQDENSECUDA)); 3049 3050 /* product data */ 3051 PetscCall(PetscNew(&mmdata)); 3052 mmdata->cisdense = cisdense; 3053 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 3054 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 3055 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 3056 PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar))); 3057 } 3058 #endif 3059 /* for these products we need intermediate storage */ 3060 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 3061 PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X)); 3062 PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA)); 3063 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 3064 PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n)); 3065 } else { 3066 PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n)); 3067 } 3068 } 3069 C->product->data = mmdata; 3070 C->product->destroy = MatDestroy_MatMatCusparse; 3071 3072 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 3073 PetscFunctionReturn(0); 3074 } 3075 3076 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3077 { 3078 Mat_Product *product = C->product; 3079 Mat A,B; 3080 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 3081 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 3082 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 3083 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3084 PetscBool flg; 3085 cusparseStatus_t stat; 3086 MatProductType ptype; 3087 MatMatCusparse *mmdata; 3088 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3089 cusparseSpMatDescr_t BmatSpDescr; 3090 #endif 3091 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3092 3093 PetscFunctionBegin; 3094 MatCheckProduct(C,1); 3095 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 3096 PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg)); 3097 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 3098 mmdata = (MatMatCusparse*)C->product->data; 3099 A = product->A; 3100 B = product->B; 3101 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 3102 mmdata->reusesym = PETSC_FALSE; 3103 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 3104 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3105 Cmat = Ccusp->mat; 3106 PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 3107 Ccsr = (CsrMatrix*)Cmat->mat; 3108 PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 3109 goto finalize; 3110 } 3111 if (!c->nz) goto finalize; 3112 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 3113 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 3114 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 3115 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 3116 PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 3117 PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 3118 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3119 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 3120 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 3121 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3122 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3123 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3124 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3125 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3126 3127 ptype = product->type; 3128 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3129 ptype = MATPRODUCT_AB; 3130 PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 3131 } 3132 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3133 ptype = MATPRODUCT_AB; 3134 PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 3135 } 3136 switch (ptype) { 3137 case MATPRODUCT_AB: 3138 Amat = Acusp->mat; 3139 Bmat = Bcusp->mat; 3140 break; 3141 case MATPRODUCT_AtB: 3142 Amat = Acusp->matTranspose; 3143 Bmat = Bcusp->mat; 3144 break; 3145 case MATPRODUCT_ABt: 3146 Amat = Acusp->mat; 3147 Bmat = Bcusp->matTranspose; 3148 break; 3149 default: 3150 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 3151 } 3152 Cmat = Ccusp->mat; 3153 PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 3154 PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 3155 PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 3156 Acsr = (CsrMatrix*)Amat->mat; 3157 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 3158 Ccsr = (CsrMatrix*)Cmat->mat; 3159 PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 3160 PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 3161 PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 3162 PetscCall(PetscLogGpuTimeBegin()); 3163 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3164 BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 3165 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3166 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3167 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 3168 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3169 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3170 mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3171 #else 3172 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3173 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3174 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3175 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 3176 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 3177 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3178 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3179 #endif 3180 #else 3181 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 3182 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3183 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3184 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 3185 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 3186 #endif 3187 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3188 PetscCallCUDA(WaitForCUDA()); 3189 PetscCall(PetscLogGpuTimeEnd()); 3190 C->offloadmask = PETSC_OFFLOAD_GPU; 3191 finalize: 3192 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3193 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 3194 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 3195 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 3196 c->reallocs = 0; 3197 C->info.mallocs += 0; 3198 C->info.nz_unneeded = 0; 3199 C->assembled = C->was_assembled = PETSC_TRUE; 3200 C->num_ass++; 3201 PetscFunctionReturn(0); 3202 } 3203 3204 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3205 { 3206 Mat_Product *product = C->product; 3207 Mat A,B; 3208 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 3209 Mat_SeqAIJ *a,*b,*c; 3210 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 3211 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3212 PetscInt i,j,m,n,k; 3213 PetscBool flg; 3214 cusparseStatus_t stat; 3215 MatProductType ptype; 3216 MatMatCusparse *mmdata; 3217 PetscLogDouble flops; 3218 PetscBool biscompressed,ciscompressed; 3219 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3220 int64_t C_num_rows1, C_num_cols1, C_nnz1; 3221 cusparseSpMatDescr_t BmatSpDescr; 3222 #else 3223 int cnz; 3224 #endif 3225 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3226 3227 PetscFunctionBegin; 3228 MatCheckProduct(C,1); 3229 PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 3230 A = product->A; 3231 B = product->B; 3232 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 3233 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 3234 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 3235 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 3236 a = (Mat_SeqAIJ*)A->data; 3237 b = (Mat_SeqAIJ*)B->data; 3238 /* product data */ 3239 PetscCall(PetscNew(&mmdata)); 3240 C->product->data = mmdata; 3241 C->product->destroy = MatDestroy_MatMatCusparse; 3242 3243 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3244 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3245 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3246 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 3247 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3248 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3249 3250 ptype = product->type; 3251 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3252 ptype = MATPRODUCT_AB; 3253 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3254 } 3255 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3256 ptype = MATPRODUCT_AB; 3257 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3258 } 3259 biscompressed = PETSC_FALSE; 3260 ciscompressed = PETSC_FALSE; 3261 switch (ptype) { 3262 case MATPRODUCT_AB: 3263 m = A->rmap->n; 3264 n = B->cmap->n; 3265 k = A->cmap->n; 3266 Amat = Acusp->mat; 3267 Bmat = Bcusp->mat; 3268 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3269 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3270 break; 3271 case MATPRODUCT_AtB: 3272 m = A->cmap->n; 3273 n = B->cmap->n; 3274 k = A->rmap->n; 3275 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3276 Amat = Acusp->matTranspose; 3277 Bmat = Bcusp->mat; 3278 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3279 break; 3280 case MATPRODUCT_ABt: 3281 m = A->rmap->n; 3282 n = B->rmap->n; 3283 k = A->cmap->n; 3284 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3285 Amat = Acusp->mat; 3286 Bmat = Bcusp->matTranspose; 3287 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3288 break; 3289 default: 3290 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 3291 } 3292 3293 /* create cusparse matrix */ 3294 PetscCall(MatSetSizes(C,m,n,m,n)); 3295 PetscCall(MatSetType(C,MATSEQAIJCUSPARSE)); 3296 c = (Mat_SeqAIJ*)C->data; 3297 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 3298 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3299 Ccsr = new CsrMatrix; 3300 3301 c->compressedrow.use = ciscompressed; 3302 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3303 c->compressedrow.nrows = a->compressedrow.nrows; 3304 PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 3305 PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 3306 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3307 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3308 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 3309 } else { 3310 c->compressedrow.nrows = 0; 3311 c->compressedrow.i = NULL; 3312 c->compressedrow.rindex = NULL; 3313 Ccusp->workVector = NULL; 3314 Cmat->cprowIndices = NULL; 3315 } 3316 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 3317 Ccusp->mat = Cmat; 3318 Ccusp->mat->mat = Ccsr; 3319 Ccsr->num_rows = Ccusp->nrows; 3320 Ccsr->num_cols = n; 3321 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 3322 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3323 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3324 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3325 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 3326 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 3327 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 3328 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 3329 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 3330 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 3331 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 3332 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 3333 c->nz = 0; 3334 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3335 Ccsr->values = new THRUSTARRAY(c->nz); 3336 goto finalizesym; 3337 } 3338 3339 PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 3340 PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 3341 Acsr = (CsrMatrix*)Amat->mat; 3342 if (!biscompressed) { 3343 Bcsr = (CsrMatrix*)Bmat->mat; 3344 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3345 BmatSpDescr = Bmat->matDescr; 3346 #endif 3347 } else { /* we need to use row offsets for the full matrix */ 3348 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 3349 Bcsr = new CsrMatrix; 3350 Bcsr->num_rows = B->rmap->n; 3351 Bcsr->num_cols = cBcsr->num_cols; 3352 Bcsr->num_entries = cBcsr->num_entries; 3353 Bcsr->column_indices = cBcsr->column_indices; 3354 Bcsr->values = cBcsr->values; 3355 if (!Bcusp->rowoffsets_gpu) { 3356 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3357 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3358 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 3359 } 3360 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3361 mmdata->Bcsr = Bcsr; 3362 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3363 if (Bcsr->num_rows && Bcsr->num_cols) { 3364 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 3365 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 3366 Bcsr->values->data().get(), 3367 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 3368 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 3369 } 3370 BmatSpDescr = mmdata->matSpBDescr; 3371 #endif 3372 } 3373 PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 3374 PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 3375 /* precompute flops count */ 3376 if (ptype == MATPRODUCT_AB) { 3377 for (i=0, flops = 0; i<A->rmap->n; i++) { 3378 const PetscInt st = a->i[i]; 3379 const PetscInt en = a->i[i+1]; 3380 for (j=st; j<en; j++) { 3381 const PetscInt brow = a->j[j]; 3382 flops += 2.*(b->i[brow+1] - b->i[brow]); 3383 } 3384 } 3385 } else if (ptype == MATPRODUCT_AtB) { 3386 for (i=0, flops = 0; i<A->rmap->n; i++) { 3387 const PetscInt anzi = a->i[i+1] - a->i[i]; 3388 const PetscInt bnzi = b->i[i+1] - b->i[i]; 3389 flops += (2.*anzi)*bnzi; 3390 } 3391 } else { /* TODO */ 3392 flops = 0.; 3393 } 3394 3395 mmdata->flops = flops; 3396 PetscCall(PetscLogGpuTimeBegin()); 3397 3398 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3399 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3400 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 3401 NULL, NULL, NULL, 3402 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 3403 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 3404 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3405 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3406 { 3407 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 3408 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3409 */ 3410 void* dBuffer1 = NULL; 3411 void* dBuffer2 = NULL; 3412 void* dBuffer3 = NULL; 3413 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3414 size_t bufferSize1 = 0; 3415 size_t bufferSize2 = 0; 3416 size_t bufferSize3 = 0; 3417 size_t bufferSize4 = 0; 3418 size_t bufferSize5 = 0; 3419 3420 /*----------------------------------------------------------------------*/ 3421 /* ask bufferSize1 bytes for external memory */ 3422 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3423 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3424 &bufferSize1, NULL);PetscCallCUSPARSE(stat); 3425 PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 3426 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3427 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3428 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3429 &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat); 3430 3431 /*----------------------------------------------------------------------*/ 3432 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3433 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3434 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat); 3435 PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 3436 PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 3437 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 3438 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3439 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3440 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat); 3441 PetscCallCUDA(cudaFree(dBuffer1)); 3442 PetscCallCUDA(cudaFree(dBuffer2)); 3443 3444 /*----------------------------------------------------------------------*/ 3445 /* get matrix C non-zero entries C_nnz1 */ 3446 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3447 c->nz = (PetscInt) C_nnz1; 3448 /* allocate matrix C */ 3449 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3450 Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3451 /* update matC with the new pointers */ 3452 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 3453 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 3454 3455 /*----------------------------------------------------------------------*/ 3456 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3457 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3458 &bufferSize5, NULL);PetscCallCUSPARSE(stat); 3459 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 3460 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3461 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 3462 &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat); 3463 PetscCallCUDA(cudaFree(dBuffer3)); 3464 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 3465 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3466 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3467 mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3468 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 3469 } 3470 #else 3471 size_t bufSize2; 3472 /* ask bufferSize bytes for external memory */ 3473 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 3474 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3475 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3476 mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat); 3477 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 3478 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3479 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 3480 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3481 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3482 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat); 3483 /* ask bufferSize again bytes for external memory */ 3484 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3485 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3486 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3487 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat); 3488 /* The CUSPARSE documentation is not clear, nor the API 3489 We need both buffers to perform the operations properly! 3490 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3491 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3492 is stored in the descriptor! What a messy API... */ 3493 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 3494 /* compute the intermediate product of A * B */ 3495 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3496 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3497 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 3498 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 3499 /* get matrix C non-zero entries C_nnz1 */ 3500 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3501 c->nz = (PetscInt) C_nnz1; 3502 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 3503 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3504 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3505 Ccsr->values = new THRUSTARRAY(c->nz); 3506 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3507 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 3508 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 3509 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 3510 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3511 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3512 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3513 #else 3514 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3515 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 3516 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3517 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3518 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 3519 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat); 3520 c->nz = cnz; 3521 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3522 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3523 Ccsr->values = new THRUSTARRAY(c->nz); 3524 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3525 3526 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3527 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3528 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3529 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3530 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 3531 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3532 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3533 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 3534 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 3535 #endif 3536 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3537 PetscCall(PetscLogGpuTimeEnd()); 3538 finalizesym: 3539 c->singlemalloc = PETSC_FALSE; 3540 c->free_a = PETSC_TRUE; 3541 c->free_ij = PETSC_TRUE; 3542 PetscCall(PetscMalloc1(m+1,&c->i)); 3543 PetscCall(PetscMalloc1(c->nz,&c->j)); 3544 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3545 PetscInt *d_i = c->i; 3546 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3547 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3548 ii = *Ccsr->row_offsets; 3549 jj = *Ccsr->column_indices; 3550 if (ciscompressed) d_i = c->compressedrow.i; 3551 PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3552 PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3553 } else { 3554 PetscInt *d_i = c->i; 3555 if (ciscompressed) d_i = c->compressedrow.i; 3556 PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3557 PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3558 } 3559 if (ciscompressed) { /* need to expand host row offsets */ 3560 PetscInt r = 0; 3561 c->i[0] = 0; 3562 for (k = 0; k < c->compressedrow.nrows; k++) { 3563 const PetscInt next = c->compressedrow.rindex[k]; 3564 const PetscInt old = c->compressedrow.i[k]; 3565 for (; r < next; r++) c->i[r+1] = old; 3566 } 3567 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 3568 } 3569 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 3570 PetscCall(PetscMalloc1(m,&c->ilen)); 3571 PetscCall(PetscMalloc1(m,&c->imax)); 3572 c->maxnz = c->nz; 3573 c->nonzerorowcnt = 0; 3574 c->rmax = 0; 3575 for (k = 0; k < m; k++) { 3576 const PetscInt nn = c->i[k+1] - c->i[k]; 3577 c->ilen[k] = c->imax[k] = nn; 3578 c->nonzerorowcnt += (PetscInt)!!nn; 3579 c->rmax = PetscMax(c->rmax,nn); 3580 } 3581 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3582 PetscCall(PetscMalloc1(c->nz,&c->a)); 3583 Ccsr->num_entries = c->nz; 3584 3585 C->nonzerostate++; 3586 PetscCall(PetscLayoutSetUp(C->rmap)); 3587 PetscCall(PetscLayoutSetUp(C->cmap)); 3588 Ccusp->nonzerostate = C->nonzerostate; 3589 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3590 C->preallocated = PETSC_TRUE; 3591 C->assembled = PETSC_FALSE; 3592 C->was_assembled = PETSC_FALSE; 3593 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3594 mmdata->reusesym = PETSC_TRUE; 3595 C->offloadmask = PETSC_OFFLOAD_GPU; 3596 } 3597 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3598 PetscFunctionReturn(0); 3599 } 3600 3601 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3602 3603 /* handles sparse or dense B */ 3604 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3605 { 3606 Mat_Product *product = mat->product; 3607 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 3608 3609 PetscFunctionBegin; 3610 MatCheckProduct(mat,1); 3611 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense)); 3612 if (!product->A->boundtocpu && !product->B->boundtocpu) { 3613 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp)); 3614 } 3615 if (product->type == MATPRODUCT_ABC) { 3616 Ciscusp = PETSC_FALSE; 3617 if (!product->C->boundtocpu) { 3618 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp)); 3619 } 3620 } 3621 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3622 PetscBool usecpu = PETSC_FALSE; 3623 switch (product->type) { 3624 case MATPRODUCT_AB: 3625 if (product->api_user) { 3626 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat"); 3627 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 3628 PetscOptionsEnd(); 3629 } else { 3630 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat"); 3631 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 3632 PetscOptionsEnd(); 3633 } 3634 break; 3635 case MATPRODUCT_AtB: 3636 if (product->api_user) { 3637 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat"); 3638 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 3639 PetscOptionsEnd(); 3640 } else { 3641 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat"); 3642 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 3643 PetscOptionsEnd(); 3644 } 3645 break; 3646 case MATPRODUCT_PtAP: 3647 if (product->api_user) { 3648 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat"); 3649 PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 3650 PetscOptionsEnd(); 3651 } else { 3652 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat"); 3653 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 3654 PetscOptionsEnd(); 3655 } 3656 break; 3657 case MATPRODUCT_RARt: 3658 if (product->api_user) { 3659 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat"); 3660 PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 3661 PetscOptionsEnd(); 3662 } else { 3663 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat"); 3664 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 3665 PetscOptionsEnd(); 3666 } 3667 break; 3668 case MATPRODUCT_ABC: 3669 if (product->api_user) { 3670 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat"); 3671 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 3672 PetscOptionsEnd(); 3673 } else { 3674 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat"); 3675 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 3676 PetscOptionsEnd(); 3677 } 3678 break; 3679 default: 3680 break; 3681 } 3682 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3683 } 3684 /* dispatch */ 3685 if (isdense) { 3686 switch (product->type) { 3687 case MATPRODUCT_AB: 3688 case MATPRODUCT_AtB: 3689 case MATPRODUCT_ABt: 3690 case MATPRODUCT_PtAP: 3691 case MATPRODUCT_RARt: 3692 if (product->A->boundtocpu) { 3693 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3694 } else { 3695 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3696 } 3697 break; 3698 case MATPRODUCT_ABC: 3699 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3700 break; 3701 default: 3702 break; 3703 } 3704 } else if (Biscusp && Ciscusp) { 3705 switch (product->type) { 3706 case MATPRODUCT_AB: 3707 case MATPRODUCT_AtB: 3708 case MATPRODUCT_ABt: 3709 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3710 break; 3711 case MATPRODUCT_PtAP: 3712 case MATPRODUCT_RARt: 3713 case MATPRODUCT_ABC: 3714 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3715 break; 3716 default: 3717 break; 3718 } 3719 } else { /* fallback for AIJ */ 3720 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3721 } 3722 PetscFunctionReturn(0); 3723 } 3724 3725 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3726 { 3727 PetscFunctionBegin; 3728 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE)); 3729 PetscFunctionReturn(0); 3730 } 3731 3732 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 3733 { 3734 PetscFunctionBegin; 3735 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE)); 3736 PetscFunctionReturn(0); 3737 } 3738 3739 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3740 { 3741 PetscFunctionBegin; 3742 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE)); 3743 PetscFunctionReturn(0); 3744 } 3745 3746 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3747 { 3748 PetscFunctionBegin; 3749 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE)); 3750 PetscFunctionReturn(0); 3751 } 3752 3753 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3754 { 3755 PetscFunctionBegin; 3756 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE)); 3757 PetscFunctionReturn(0); 3758 } 3759 3760 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 3761 { 3762 int i = blockIdx.x*blockDim.x + threadIdx.x; 3763 if (i < n) y[idx[i]] += x[i]; 3764 } 3765 3766 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3767 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 3768 { 3769 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3770 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 3771 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3772 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 3773 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3774 PetscBool compressed; 3775 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3776 PetscInt nx,ny; 3777 #endif 3778 3779 PetscFunctionBegin; 3780 PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3781 if (!a->nz) { 3782 if (!yy) PetscCall(VecSet_SeqCUDA(zz,0)); 3783 else PetscCall(VecCopy_SeqCUDA(yy,zz)); 3784 PetscFunctionReturn(0); 3785 } 3786 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3787 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3788 if (!trans) { 3789 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3790 PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3791 } else { 3792 if (herm || !A->form_explicit_transpose) { 3793 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3794 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3795 } else { 3796 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3797 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3798 } 3799 } 3800 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3801 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3802 3803 try { 3804 PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray)); 3805 if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */ 3806 else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */ 3807 3808 PetscCall(PetscLogGpuTimeBegin()); 3809 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3810 /* z = A x + beta y. 3811 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3812 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3813 */ 3814 xptr = xarray; 3815 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3816 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3817 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3818 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3819 allocated to accommodate different uses. So we get the length info directly from mat. 3820 */ 3821 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3822 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3823 nx = mat->num_cols; 3824 ny = mat->num_rows; 3825 } 3826 #endif 3827 } else { 3828 /* z = A^T x + beta y 3829 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3830 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3831 */ 3832 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3833 dptr = zarray; 3834 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 3835 if (compressed) { /* Scatter x to work vector */ 3836 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3837 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3838 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3839 VecCUDAEqualsReverse()); 3840 } 3841 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3842 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3843 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3844 nx = mat->num_rows; 3845 ny = mat->num_cols; 3846 } 3847 #endif 3848 } 3849 3850 /* csr_spmv does y = alpha op(A) x + beta y */ 3851 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3852 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3853 PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3854 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3855 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype)); 3856 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype)); 3857 PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3858 matstruct->matDescr, 3859 matstruct->cuSpMV[opA].vecXDescr, beta, 3860 matstruct->cuSpMV[opA].vecYDescr, 3861 cusparse_scalartype, 3862 cusparsestruct->spmvAlg, 3863 &matstruct->cuSpMV[opA].spmvBufferSize)); 3864 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize)); 3865 3866 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3867 } else { 3868 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3869 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr)); 3870 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr)); 3871 } 3872 3873 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, 3874 matstruct->alpha_one, 3875 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3876 matstruct->cuSpMV[opA].vecXDescr, 3877 beta, 3878 matstruct->cuSpMV[opA].vecYDescr, 3879 cusparse_scalartype, 3880 cusparsestruct->spmvAlg, 3881 matstruct->cuSpMV[opA].spmvBuffer)); 3882 #else 3883 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3884 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, 3885 mat->num_rows, mat->num_cols, 3886 mat->num_entries, matstruct->alpha_one, matstruct->descr, 3887 mat->values->data().get(), mat->row_offsets->data().get(), 3888 mat->column_indices->data().get(), xptr, beta, 3889 dptr)); 3890 #endif 3891 } else { 3892 if (cusparsestruct->nrows) { 3893 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3894 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3895 #else 3896 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3897 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, 3898 matstruct->alpha_one, matstruct->descr, hybMat, 3899 xptr, beta, 3900 dptr)); 3901 #endif 3902 } 3903 } 3904 PetscCall(PetscLogGpuTimeEnd()); 3905 3906 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3907 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3908 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3909 PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */ 3910 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3911 PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 3912 } 3913 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3914 PetscCall(VecSet_SeqCUDA(zz,0)); 3915 } 3916 3917 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3918 if (compressed) { 3919 PetscCall(PetscLogGpuTimeBegin()); 3920 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3921 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3922 prevent that. So I just add a ScatterAdd kernel. 3923 */ 3924 #if 0 3925 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3926 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3927 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3928 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3929 VecCUDAPlusEquals()); 3930 #else 3931 PetscInt n = matstruct->cprowIndices->size(); 3932 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3933 #endif 3934 PetscCall(PetscLogGpuTimeEnd()); 3935 } 3936 } else { 3937 if (yy && yy != zz) { 3938 PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 3939 } 3940 } 3941 PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray)); 3942 if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray)); 3943 else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray)); 3944 } catch(char *ex) { 3945 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3946 } 3947 if (yy) { 3948 PetscCall(PetscLogGpuFlops(2.0*a->nz)); 3949 } else { 3950 PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt)); 3951 } 3952 PetscFunctionReturn(0); 3953 } 3954 3955 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3956 { 3957 PetscFunctionBegin; 3958 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE)); 3959 PetscFunctionReturn(0); 3960 } 3961 3962 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 3963 { 3964 PetscObjectState onnz = A->nonzerostate; 3965 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3966 3967 PetscFunctionBegin; 3968 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 3969 if (onnz != A->nonzerostate && cusp->deviceMat) { 3970 3971 PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n")); 3972 PetscCallCUDA(cudaFree(cusp->deviceMat)); 3973 cusp->deviceMat = NULL; 3974 } 3975 PetscFunctionReturn(0); 3976 } 3977 3978 /* --------------------------------------------------------------------------------*/ 3979 /*@ 3980 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3981 (the default parallel PETSc format). This matrix will ultimately pushed down 3982 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3983 assembly performance the user should preallocate the matrix storage by setting 3984 the parameter nz (or the array nnz). By setting these parameters accurately, 3985 performance during matrix assembly can be increased by more than a factor of 50. 3986 3987 Collective 3988 3989 Input Parameters: 3990 + comm - MPI communicator, set to PETSC_COMM_SELF 3991 . m - number of rows 3992 . n - number of columns 3993 . nz - number of nonzeros per row (same for all rows) 3994 - nnz - array containing the number of nonzeros in the various rows 3995 (possibly different for each row) or NULL 3996 3997 Output Parameter: 3998 . A - the matrix 3999 4000 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 4001 MatXXXXSetPreallocation() paradgm instead of this routine directly. 4002 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 4003 4004 Notes: 4005 If nnz is given then nz is ignored 4006 4007 The AIJ format (also called the Yale sparse matrix format or 4008 compressed row storage), is fully compatible with standard Fortran 77 4009 storage. That is, the stored row and column indices can begin at 4010 either one (as in Fortran) or zero. See the users' manual for details. 4011 4012 Specify the preallocated storage with either nz or nnz (not both). 4013 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 4014 allocation. For large problems you MUST preallocate memory or you 4015 will get TERRIBLE performance, see the users' manual chapter on matrices. 4016 4017 By default, this format uses inodes (identical nodes) when possible, to 4018 improve numerical efficiency of matrix-vector products and solves. We 4019 search for consecutive rows with the same nonzero structure, thereby 4020 reusing matrix information to achieve increased efficiency. 4021 4022 Level: intermediate 4023 4024 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE` 4025 @*/ 4026 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 4027 { 4028 PetscFunctionBegin; 4029 PetscCall(MatCreate(comm,A)); 4030 PetscCall(MatSetSizes(*A,m,n,m,n)); 4031 PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE)); 4032 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 4033 PetscFunctionReturn(0); 4034 } 4035 4036 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 4037 { 4038 PetscFunctionBegin; 4039 if (A->factortype == MAT_FACTOR_NONE) { 4040 PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr)); 4041 } else { 4042 PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr)); 4043 } 4044 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 4045 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 4046 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL)); 4047 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 4048 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 4049 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 4050 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 4051 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 4052 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 4053 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL)); 4054 PetscCall(MatDestroy_SeqAIJ(A)); 4055 PetscFunctionReturn(0); 4056 } 4057 4058 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 4059 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 4060 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 4061 { 4062 PetscFunctionBegin; 4063 PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B)); 4064 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B)); 4065 PetscFunctionReturn(0); 4066 } 4067 4068 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 4069 { 4070 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 4071 Mat_SeqAIJCUSPARSE *cy; 4072 Mat_SeqAIJCUSPARSE *cx; 4073 PetscScalar *ay; 4074 const PetscScalar *ax; 4075 CsrMatrix *csry,*csrx; 4076 4077 PetscFunctionBegin; 4078 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 4079 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 4080 if (X->ops->axpy != Y->ops->axpy) { 4081 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 4082 PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 4083 PetscFunctionReturn(0); 4084 } 4085 /* if we are here, it means both matrices are bound to GPU */ 4086 PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 4087 PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 4088 PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 4089 PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 4090 csry = (CsrMatrix*)cy->mat->mat; 4091 csrx = (CsrMatrix*)cx->mat->mat; 4092 /* see if we can turn this into a cublas axpy */ 4093 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 4094 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 4095 if (eq) { 4096 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 4097 } 4098 if (eq) str = SAME_NONZERO_PATTERN; 4099 } 4100 /* spgeam is buggy with one column */ 4101 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 4102 4103 if (str == SUBSET_NONZERO_PATTERN) { 4104 PetscScalar b = 1.0; 4105 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4106 size_t bufferSize; 4107 void *buffer; 4108 #endif 4109 4110 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 4111 PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 4112 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 4113 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4114 PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 4115 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 4116 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 4117 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize)); 4118 PetscCallCUDA(cudaMalloc(&buffer,bufferSize)); 4119 PetscCall(PetscLogGpuTimeBegin()); 4120 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 4121 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 4122 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 4123 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer)); 4124 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 4125 PetscCall(PetscLogGpuTimeEnd()); 4126 PetscCallCUDA(cudaFree(buffer)); 4127 #else 4128 PetscCall(PetscLogGpuTimeBegin()); 4129 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 4130 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 4131 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 4132 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get())); 4133 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 4134 PetscCall(PetscLogGpuTimeEnd()); 4135 #endif 4136 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 4137 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 4138 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 4139 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 4140 } else if (str == SAME_NONZERO_PATTERN) { 4141 cublasHandle_t cublasv2handle; 4142 PetscBLASInt one = 1, bnz = 1; 4143 4144 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 4145 PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 4146 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 4147 PetscCall(PetscBLASIntCast(x->nz,&bnz)); 4148 PetscCall(PetscLogGpuTimeBegin()); 4149 PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one)); 4150 PetscCall(PetscLogGpuFlops(2.0*bnz)); 4151 PetscCall(PetscLogGpuTimeEnd()); 4152 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 4153 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 4154 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 4155 } else { 4156 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 4157 PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 4158 } 4159 PetscFunctionReturn(0); 4160 } 4161 4162 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 4163 { 4164 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 4165 PetscScalar *ay; 4166 cublasHandle_t cublasv2handle; 4167 PetscBLASInt one = 1, bnz = 1; 4168 4169 PetscFunctionBegin; 4170 PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 4171 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 4172 PetscCall(PetscBLASIntCast(y->nz,&bnz)); 4173 PetscCall(PetscLogGpuTimeBegin()); 4174 PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one)); 4175 PetscCall(PetscLogGpuFlops(bnz)); 4176 PetscCall(PetscLogGpuTimeEnd()); 4177 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 4178 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 4179 PetscFunctionReturn(0); 4180 } 4181 4182 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 4183 { 4184 PetscBool both = PETSC_FALSE; 4185 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4186 4187 PetscFunctionBegin; 4188 if (A->factortype == MAT_FACTOR_NONE) { 4189 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 4190 if (spptr->mat) { 4191 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 4192 if (matrix->values) { 4193 both = PETSC_TRUE; 4194 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 4195 } 4196 } 4197 if (spptr->matTranspose) { 4198 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 4199 if (matrix->values) { 4200 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 4201 } 4202 } 4203 } 4204 PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n])); 4205 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4206 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 4207 else A->offloadmask = PETSC_OFFLOAD_CPU; 4208 PetscFunctionReturn(0); 4209 } 4210 4211 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 4212 { 4213 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4214 4215 PetscFunctionBegin; 4216 if (A->factortype != MAT_FACTOR_NONE) { 4217 A->boundtocpu = flg; 4218 PetscFunctionReturn(0); 4219 } 4220 if (flg) { 4221 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 4222 4223 A->ops->scale = MatScale_SeqAIJ; 4224 A->ops->axpy = MatAXPY_SeqAIJ; 4225 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 4226 A->ops->mult = MatMult_SeqAIJ; 4227 A->ops->multadd = MatMultAdd_SeqAIJ; 4228 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 4229 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 4230 A->ops->multhermitiantranspose = NULL; 4231 A->ops->multhermitiantransposeadd = NULL; 4232 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 4233 PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps))); 4234 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 4235 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 4236 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 4237 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 4238 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 4239 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 4240 } else { 4241 A->ops->scale = MatScale_SeqAIJCUSPARSE; 4242 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 4243 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 4244 A->ops->mult = MatMult_SeqAIJCUSPARSE; 4245 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 4246 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 4247 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 4248 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 4249 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 4250 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 4251 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 4252 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 4253 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 4254 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 4255 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 4256 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 4257 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 4258 4259 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 4260 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 4261 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 4262 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE)); 4263 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE)); 4264 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 4265 } 4266 A->boundtocpu = flg; 4267 if (flg && a->inode.size) { 4268 a->inode.use = PETSC_TRUE; 4269 } else { 4270 a->inode.use = PETSC_FALSE; 4271 } 4272 PetscFunctionReturn(0); 4273 } 4274 4275 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4276 { 4277 Mat B; 4278 4279 PetscFunctionBegin; 4280 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 4281 if (reuse == MAT_INITIAL_MATRIX) { 4282 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); 4283 } else if (reuse == MAT_REUSE_MATRIX) { 4284 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); 4285 } 4286 B = *newmat; 4287 4288 PetscCall(PetscFree(B->defaultvectype)); 4289 PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype)); 4290 4291 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 4292 if (B->factortype == MAT_FACTOR_NONE) { 4293 Mat_SeqAIJCUSPARSE *spptr; 4294 PetscCall(PetscNew(&spptr)); 4295 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4296 PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 4297 spptr->format = MAT_CUSPARSE_CSR; 4298 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4299 #if CUSPARSE_VERSION > 11301 4300 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4301 #else 4302 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4303 #endif 4304 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4305 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4306 #endif 4307 B->spptr = spptr; 4308 } else { 4309 Mat_SeqAIJCUSPARSETriFactors *spptr; 4310 4311 PetscCall(PetscNew(&spptr)); 4312 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4313 PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 4314 B->spptr = spptr; 4315 } 4316 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 4317 } 4318 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 4319 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 4320 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 4321 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 4322 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 4323 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 4324 4325 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); 4326 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE)); 4327 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4328 #if defined(PETSC_HAVE_HYPRE) 4329 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 4330 #endif 4331 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 4332 PetscFunctionReturn(0); 4333 } 4334 4335 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 4336 { 4337 PetscFunctionBegin; 4338 PetscCall(MatCreate_SeqAIJ(B)); 4339 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B)); 4340 PetscFunctionReturn(0); 4341 } 4342 4343 /*MC 4344 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 4345 4346 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 4347 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 4348 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 4349 4350 Options Database Keys: 4351 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 4352 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 4353 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 4354 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 4355 4356 Level: beginner 4357 4358 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 4359 M*/ 4360 4361 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 4362 4363 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 4364 { 4365 PetscFunctionBegin; 4366 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band)); 4367 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse)); 4368 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse)); 4369 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse)); 4370 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse)); 4371 4372 PetscFunctionReturn(0); 4373 } 4374 4375 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 4376 { 4377 Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 4378 4379 PetscFunctionBegin; 4380 if (!cusp) PetscFunctionReturn(0); 4381 delete cusp->cooPerm; 4382 delete cusp->cooPerm_a; 4383 cusp->cooPerm = NULL; 4384 cusp->cooPerm_a = NULL; 4385 if (cusp->use_extended_coo) { 4386 PetscCallCUDA(cudaFree(cusp->jmap_d)); 4387 PetscCallCUDA(cudaFree(cusp->perm_d)); 4388 } 4389 cusp->use_extended_coo = PETSC_FALSE; 4390 PetscFunctionReturn(0); 4391 } 4392 4393 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 4394 { 4395 PetscFunctionBegin; 4396 if (*cusparsestruct) { 4397 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format)); 4398 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 4399 delete (*cusparsestruct)->workVector; 4400 delete (*cusparsestruct)->rowoffsets_gpu; 4401 delete (*cusparsestruct)->cooPerm; 4402 delete (*cusparsestruct)->cooPerm_a; 4403 delete (*cusparsestruct)->csr2csc_i; 4404 if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 4405 if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 4406 if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 4407 PetscCall(PetscFree(*cusparsestruct)); 4408 } 4409 PetscFunctionReturn(0); 4410 } 4411 4412 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 4413 { 4414 PetscFunctionBegin; 4415 if (*mat) { 4416 delete (*mat)->values; 4417 delete (*mat)->column_indices; 4418 delete (*mat)->row_offsets; 4419 delete *mat; 4420 *mat = 0; 4421 } 4422 PetscFunctionReturn(0); 4423 } 4424 4425 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 4426 { 4427 PetscFunctionBegin; 4428 if (*trifactor) { 4429 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 4430 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 4431 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 4432 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 4433 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4434 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4435 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 4436 #endif 4437 PetscCall(PetscFree(*trifactor)); 4438 } 4439 PetscFunctionReturn(0); 4440 } 4441 4442 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 4443 { 4444 CsrMatrix *mat; 4445 4446 PetscFunctionBegin; 4447 if (*matstruct) { 4448 if ((*matstruct)->mat) { 4449 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 4450 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4451 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 4452 #else 4453 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 4454 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4455 #endif 4456 } else { 4457 mat = (CsrMatrix*)(*matstruct)->mat; 4458 CsrMatrix_Destroy(&mat); 4459 } 4460 } 4461 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 4462 delete (*matstruct)->cprowIndices; 4463 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 4464 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 4465 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4466 4467 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4468 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 4469 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4470 for (int i=0; i<3; i++) { 4471 if (mdata->cuSpMV[i].initialized) { 4472 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 4473 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 4474 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4475 } 4476 } 4477 #endif 4478 delete *matstruct; 4479 *matstruct = NULL; 4480 } 4481 PetscFunctionReturn(0); 4482 } 4483 4484 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 4485 { 4486 Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4487 4488 PetscFunctionBegin; 4489 if (fs) { 4490 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4491 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4492 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4493 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4494 delete fs->rpermIndices; 4495 delete fs->cpermIndices; 4496 delete fs->workVector; 4497 fs->rpermIndices = NULL; 4498 fs->cpermIndices = NULL; 4499 fs->workVector = NULL; 4500 if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d)); 4501 if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d)); 4502 fs->init_dev_prop = PETSC_FALSE; 4503 #if CUSPARSE_VERSION >= 11500 4504 PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4505 PetscCallCUDA(cudaFree(fs->csrColIdx)); 4506 PetscCallCUDA(cudaFree(fs->csrVal)); 4507 PetscCallCUDA(cudaFree(fs->X)); 4508 PetscCallCUDA(cudaFree(fs->Y)); 4509 // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 4510 PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4511 PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 4512 PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4513 PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4514 PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4515 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4516 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4517 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4518 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4519 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4520 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4521 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4522 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4523 PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4524 PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4525 4526 fs->createdTransposeSpSVDescr = PETSC_FALSE; 4527 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4528 #endif 4529 } 4530 PetscFunctionReturn(0); 4531 } 4532 4533 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 4534 { 4535 cusparseHandle_t handle; 4536 4537 PetscFunctionBegin; 4538 if (*trifactors) { 4539 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 4540 if (handle = (*trifactors)->handle) { 4541 PetscCallCUSPARSE(cusparseDestroy(handle)); 4542 } 4543 PetscCall(PetscFree(*trifactors)); 4544 } 4545 PetscFunctionReturn(0); 4546 } 4547 4548 struct IJCompare 4549 { 4550 __host__ __device__ 4551 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4552 { 4553 if (t1.get<0>() < t2.get<0>()) return true; 4554 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4555 return false; 4556 } 4557 }; 4558 4559 struct IJEqual 4560 { 4561 __host__ __device__ 4562 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4563 { 4564 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 4565 return true; 4566 } 4567 }; 4568 4569 struct IJDiff 4570 { 4571 __host__ __device__ 4572 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 4573 { 4574 return t1 == t2 ? 0 : 1; 4575 } 4576 }; 4577 4578 struct IJSum 4579 { 4580 __host__ __device__ 4581 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 4582 { 4583 return t1||t2; 4584 } 4585 }; 4586 4587 #include <thrust/iterator/discard_iterator.h> 4588 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 4589 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 4590 { 4591 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4592 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4593 THRUSTARRAY *cooPerm_v = NULL; 4594 thrust::device_ptr<const PetscScalar> d_v; 4595 CsrMatrix *matrix; 4596 PetscInt n; 4597 4598 PetscFunctionBegin; 4599 PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 4600 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 4601 if (!cusp->cooPerm) { 4602 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 4603 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 4604 PetscFunctionReturn(0); 4605 } 4606 matrix = (CsrMatrix*)cusp->mat->mat; 4607 PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4608 if (!v) { 4609 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 4610 goto finalize; 4611 } 4612 n = cusp->cooPerm->size(); 4613 if (isCudaMem(v)) { 4614 d_v = thrust::device_pointer_cast(v); 4615 } else { 4616 cooPerm_v = new THRUSTARRAY(n); 4617 cooPerm_v->assign(v,v+n); 4618 d_v = cooPerm_v->data(); 4619 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 4620 } 4621 PetscCall(PetscLogGpuTimeBegin()); 4622 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 4623 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 4624 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 4625 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 4626 /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 4627 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 4628 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 4629 */ 4630 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 4631 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 4632 delete cooPerm_w; 4633 } else { 4634 /* all nonzeros in d_v[] are unique entries */ 4635 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 4636 matrix->values->begin())); 4637 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 4638 matrix->values->end())); 4639 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 4640 } 4641 } else { 4642 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 4643 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 4644 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 4645 } else { 4646 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 4647 matrix->values->begin())); 4648 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 4649 matrix->values->end())); 4650 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4651 } 4652 } 4653 PetscCall(PetscLogGpuTimeEnd()); 4654 finalize: 4655 delete cooPerm_v; 4656 A->offloadmask = PETSC_OFFLOAD_GPU; 4657 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4658 /* shorter version of MatAssemblyEnd_SeqAIJ */ 4659 PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz)); 4660 PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n")); 4661 PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax)); 4662 a->reallocs = 0; 4663 A->info.mallocs += 0; 4664 A->info.nz_unneeded = 0; 4665 A->assembled = A->was_assembled = PETSC_TRUE; 4666 A->num_ass++; 4667 PetscFunctionReturn(0); 4668 } 4669 4670 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4671 { 4672 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4673 4674 PetscFunctionBegin; 4675 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4676 if (!cusp) PetscFunctionReturn(0); 4677 if (destroy) { 4678 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format)); 4679 delete cusp->csr2csc_i; 4680 cusp->csr2csc_i = NULL; 4681 } 4682 A->transupdated = PETSC_FALSE; 4683 PetscFunctionReturn(0); 4684 } 4685 4686 #include <thrust/binary_search.h> 4687 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 4688 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) 4689 { 4690 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4691 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4692 PetscInt cooPerm_n, nzr = 0; 4693 4694 PetscFunctionBegin; 4695 PetscCall(PetscLayoutSetUp(A->rmap)); 4696 PetscCall(PetscLayoutSetUp(A->cmap)); 4697 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 4698 if (n != cooPerm_n) { 4699 delete cusp->cooPerm; 4700 delete cusp->cooPerm_a; 4701 cusp->cooPerm = NULL; 4702 cusp->cooPerm_a = NULL; 4703 } 4704 if (n) { 4705 thrust::device_ptr<PetscInt> d_i,d_j; 4706 PetscInt *d_raw_i,*d_raw_j; 4707 PetscBool free_raw_i = PETSC_FALSE,free_raw_j = PETSC_FALSE; 4708 PetscMemType imtype,jmtype; 4709 4710 PetscCall(PetscGetMemType(coo_i,&imtype)); 4711 if (PetscMemTypeHost(imtype)) { 4712 PetscCallCUDA(cudaMalloc(&d_raw_i,sizeof(PetscInt)*n)); 4713 PetscCallCUDA(cudaMemcpy(d_raw_i,coo_i,sizeof(PetscInt)*n,cudaMemcpyHostToDevice)); 4714 d_i = thrust::device_pointer_cast(d_raw_i); 4715 free_raw_i = PETSC_TRUE; 4716 PetscCall(PetscLogCpuToGpu(1.*n*sizeof(PetscInt))); 4717 } else { 4718 d_i = thrust::device_pointer_cast(coo_i); 4719 } 4720 4721 PetscCall(PetscGetMemType(coo_j,&jmtype)); 4722 if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]! 4723 PetscCallCUDA(cudaMalloc(&d_raw_j,sizeof(PetscInt)*n)); 4724 PetscCallCUDA(cudaMemcpy(d_raw_j,coo_j,sizeof(PetscInt)*n,cudaMemcpyHostToDevice)); 4725 d_j = thrust::device_pointer_cast(d_raw_j); 4726 free_raw_j = PETSC_TRUE; 4727 PetscCall(PetscLogCpuToGpu(1.*n*sizeof(PetscInt))); 4728 } else { 4729 d_j = thrust::device_pointer_cast(coo_j); 4730 } 4731 4732 THRUSTINTARRAY ii(A->rmap->n); 4733 4734 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 4735 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 4736 4737 /* Ex. 4738 n = 6 4739 coo_i = [3,3,1,4,1,4] 4740 coo_j = [3,2,2,5,2,6] 4741 */ 4742 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i,d_j)); 4743 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i+n,d_j+n)); 4744 4745 PetscCall(PetscLogGpuTimeBegin()); 4746 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4747 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4748 (*cusp->cooPerm_a).assign(d_i,d_i+n); /* copy the sorted array */ 4749 THRUSTINTARRAY w(d_j,d_j+n); 4750 4751 /* 4752 d_i = [1,1,3,3,4,4] 4753 d_j = [2,2,2,3,5,6] 4754 cooPerm = [2,4,1,0,3,5] 4755 */ 4756 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4757 4758 /* 4759 d_i = [1,3,3,4,4,x] 4760 ^ekey 4761 d_j = [2,2,3,5,6,x] 4762 ^nekye 4763 */ 4764 if (nekey == ekey) { /* all entries are unique */ 4765 delete cusp->cooPerm_a; 4766 cusp->cooPerm_a = NULL; 4767 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4768 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4769 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4770 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4771 (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 4772 w[0] = 0; 4773 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4774 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4775 } 4776 thrust::counting_iterator<PetscInt> search_begin(0); 4777 thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4778 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4779 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 4780 PetscCall(PetscLogGpuTimeEnd()); 4781 4782 PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i)); 4783 a->singlemalloc = PETSC_FALSE; 4784 a->free_a = PETSC_TRUE; 4785 a->free_ij = PETSC_TRUE; 4786 PetscCall(PetscMalloc1(A->rmap->n+1,&a->i)); 4787 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4788 PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4789 a->nz = a->maxnz = a->i[A->rmap->n]; 4790 a->rmax = 0; 4791 PetscCall(PetscMalloc1(a->nz,&a->a)); 4792 PetscCall(PetscMalloc1(a->nz,&a->j)); 4793 PetscCallCUDA(cudaMemcpy(a->j,thrust::raw_pointer_cast(d_j),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4794 if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen)); 4795 if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax)); 4796 for (PetscInt i = 0; i < A->rmap->n; i++) { 4797 const PetscInt nnzr = a->i[i+1] - a->i[i]; 4798 nzr += (PetscInt)!!(nnzr); 4799 a->ilen[i] = a->imax[i] = nnzr; 4800 a->rmax = PetscMax(a->rmax,nnzr); 4801 } 4802 a->nonzerorowcnt = nzr; 4803 A->preallocated = PETSC_TRUE; 4804 PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt))); 4805 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 4806 if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i)); 4807 if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j)); 4808 } else { 4809 PetscCall(MatSeqAIJSetPreallocation(A,0,NULL)); 4810 } 4811 PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 4812 4813 /* We want to allocate the CUSPARSE struct for matvec now. 4814 The code is so convoluted now that I prefer to copy zeros */ 4815 PetscCall(PetscArrayzero(a->a,a->nz)); 4816 PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6)); 4817 A->offloadmask = PETSC_OFFLOAD_CPU; 4818 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4819 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 4820 PetscFunctionReturn(0); 4821 } 4822 4823 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4824 { 4825 Mat_SeqAIJ *seq; 4826 Mat_SeqAIJCUSPARSE *dev; 4827 PetscBool coo_basic = PETSC_TRUE; 4828 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4829 4830 PetscFunctionBegin; 4831 PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 4832 PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4833 if (coo_i) { 4834 PetscCall(PetscGetMemType(coo_i,&mtype)); 4835 if (PetscMemTypeHost(mtype)) { 4836 for (PetscCount k=0; k<coo_n; k++) { 4837 if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;} 4838 } 4839 } 4840 } 4841 4842 if (coo_basic) { /* i,j are on device or do not contain negative indices */ 4843 PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 4844 } else { 4845 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 4846 mat->offloadmask = PETSC_OFFLOAD_CPU; 4847 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4848 seq = static_cast<Mat_SeqAIJ*>(mat->data); 4849 dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 4850 PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount))); 4851 PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4852 PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount))); 4853 PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4854 dev->use_extended_coo = PETSC_TRUE; 4855 } 4856 PetscFunctionReturn(0); 4857 } 4858 4859 __global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4860 { 4861 PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4862 const PetscCount grid_size = gridDim.x * blockDim.x; 4863 for (; i<nnz; i+= grid_size) { 4864 PetscScalar sum = 0.0; 4865 for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4866 a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum; 4867 } 4868 } 4869 4870 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4871 { 4872 Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4873 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4874 PetscCount Annz = seq->nz; 4875 PetscMemType memtype; 4876 const PetscScalar *v1 = v; 4877 PetscScalar *Aa; 4878 4879 PetscFunctionBegin; 4880 if (dev->use_extended_coo) { 4881 PetscCall(PetscGetMemType(v,&memtype)); 4882 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 4883 PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar))); 4884 PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4885 } 4886 4887 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); 4888 else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); 4889 4890 if (Annz) { 4891 MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 4892 PetscCallCUDA(cudaPeekAtLastError()); 4893 } 4894 4895 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 4896 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 4897 4898 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); 4899 } else { 4900 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode)); 4901 } 4902 PetscFunctionReturn(0); 4903 } 4904 4905 /*@C 4906 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 4907 4908 Not collective 4909 4910 Input Parameters: 4911 + A - the matrix 4912 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4913 4914 Output Parameters: 4915 + ia - the CSR row pointers 4916 - ja - the CSR column indices 4917 4918 Level: developer 4919 4920 Notes: 4921 When compressed is true, the CSR structure does not contain empty rows 4922 4923 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 4924 @*/ 4925 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4926 { 4927 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4928 CsrMatrix *csr; 4929 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4930 4931 PetscFunctionBegin; 4932 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4933 if (!i || !j) PetscFunctionReturn(0); 4934 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4935 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4936 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4937 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4938 csr = (CsrMatrix*)cusp->mat->mat; 4939 if (i) { 4940 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4941 if (!cusp->rowoffsets_gpu) { 4942 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4943 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4944 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4945 } 4946 *i = cusp->rowoffsets_gpu->data().get(); 4947 } else *i = csr->row_offsets->data().get(); 4948 } 4949 if (j) *j = csr->column_indices->data().get(); 4950 PetscFunctionReturn(0); 4951 } 4952 4953 /*@C 4954 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4955 4956 Not collective 4957 4958 Input Parameters: 4959 + A - the matrix 4960 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4961 4962 Output Parameters: 4963 + ia - the CSR row pointers 4964 - ja - the CSR column indices 4965 4966 Level: developer 4967 4968 .seealso: `MatSeqAIJCUSPARSEGetIJ()` 4969 @*/ 4970 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4971 { 4972 PetscFunctionBegin; 4973 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4974 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4975 if (i) *i = NULL; 4976 if (j) *j = NULL; 4977 PetscFunctionReturn(0); 4978 } 4979 4980 /*@C 4981 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4982 4983 Not Collective 4984 4985 Input Parameter: 4986 . A - a MATSEQAIJCUSPARSE matrix 4987 4988 Output Parameter: 4989 . a - pointer to the device data 4990 4991 Level: developer 4992 4993 Notes: may trigger host-device copies if up-to-date matrix data is on host 4994 4995 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 4996 @*/ 4997 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4998 { 4999 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 5000 CsrMatrix *csr; 5001 5002 PetscFunctionBegin; 5003 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5004 PetscValidPointer(a,2); 5005 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5006 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5007 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 5008 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5009 csr = (CsrMatrix*)cusp->mat->mat; 5010 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5011 *a = csr->values->data().get(); 5012 PetscFunctionReturn(0); 5013 } 5014 5015 /*@C 5016 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 5017 5018 Not Collective 5019 5020 Input Parameter: 5021 . A - a MATSEQAIJCUSPARSE matrix 5022 5023 Output Parameter: 5024 . a - pointer to the device data 5025 5026 Level: developer 5027 5028 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()` 5029 @*/ 5030 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 5031 { 5032 PetscFunctionBegin; 5033 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5034 PetscValidPointer(a,2); 5035 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5036 *a = NULL; 5037 PetscFunctionReturn(0); 5038 } 5039 5040 /*@C 5041 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 5042 5043 Not Collective 5044 5045 Input Parameter: 5046 . A - a MATSEQAIJCUSPARSE matrix 5047 5048 Output Parameter: 5049 . a - pointer to the device data 5050 5051 Level: developer 5052 5053 Notes: may trigger host-device copies if up-to-date matrix data is on host 5054 5055 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 5056 @*/ 5057 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 5058 { 5059 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 5060 CsrMatrix *csr; 5061 5062 PetscFunctionBegin; 5063 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5064 PetscValidPointer(a,2); 5065 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5066 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5067 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 5068 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5069 csr = (CsrMatrix*)cusp->mat->mat; 5070 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5071 *a = csr->values->data().get(); 5072 A->offloadmask = PETSC_OFFLOAD_GPU; 5073 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 5074 PetscFunctionReturn(0); 5075 } 5076 /*@C 5077 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 5078 5079 Not Collective 5080 5081 Input Parameter: 5082 . A - a MATSEQAIJCUSPARSE matrix 5083 5084 Output Parameter: 5085 . a - pointer to the device data 5086 5087 Level: developer 5088 5089 .seealso: `MatSeqAIJCUSPARSEGetArray()` 5090 @*/ 5091 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 5092 { 5093 PetscFunctionBegin; 5094 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5095 PetscValidPointer(a,2); 5096 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5097 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 5098 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 5099 *a = NULL; 5100 PetscFunctionReturn(0); 5101 } 5102 5103 /*@C 5104 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 5105 5106 Not Collective 5107 5108 Input Parameter: 5109 . A - a MATSEQAIJCUSPARSE matrix 5110 5111 Output Parameter: 5112 . a - pointer to the device data 5113 5114 Level: developer 5115 5116 Notes: does not trigger host-device copies and flags data validity on the GPU 5117 5118 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 5119 @*/ 5120 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 5121 { 5122 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 5123 CsrMatrix *csr; 5124 5125 PetscFunctionBegin; 5126 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5127 PetscValidPointer(a,2); 5128 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5129 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5130 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5131 csr = (CsrMatrix*)cusp->mat->mat; 5132 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5133 *a = csr->values->data().get(); 5134 A->offloadmask = PETSC_OFFLOAD_GPU; 5135 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 5136 PetscFunctionReturn(0); 5137 } 5138 5139 /*@C 5140 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 5141 5142 Not Collective 5143 5144 Input Parameter: 5145 . A - a MATSEQAIJCUSPARSE matrix 5146 5147 Output Parameter: 5148 . a - pointer to the device data 5149 5150 Level: developer 5151 5152 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 5153 @*/ 5154 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 5155 { 5156 PetscFunctionBegin; 5157 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5158 PetscValidPointer(a,2); 5159 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5160 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 5161 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 5162 *a = NULL; 5163 PetscFunctionReturn(0); 5164 } 5165 5166 struct IJCompare4 5167 { 5168 __host__ __device__ 5169 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 5170 { 5171 if (t1.get<0>() < t2.get<0>()) return true; 5172 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 5173 return false; 5174 } 5175 }; 5176 5177 struct Shift 5178 { 5179 int _shift; 5180 5181 Shift(int shift) : _shift(shift) {} 5182 __host__ __device__ 5183 inline int operator() (const int &c) 5184 { 5185 return c + _shift; 5186 } 5187 }; 5188 5189 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 5190 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 5191 { 5192 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 5193 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 5194 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 5195 CsrMatrix *Acsr,*Bcsr,*Ccsr; 5196 PetscInt Annz,Bnnz; 5197 cusparseStatus_t stat; 5198 PetscInt i,m,n,zero = 0; 5199 5200 PetscFunctionBegin; 5201 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5202 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 5203 PetscValidPointer(C,4); 5204 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5205 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 5206 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 5207 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 5208 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5209 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5210 if (reuse == MAT_INITIAL_MATRIX) { 5211 m = A->rmap->n; 5212 n = A->cmap->n + B->cmap->n; 5213 PetscCall(MatCreate(PETSC_COMM_SELF,C)); 5214 PetscCall(MatSetSizes(*C,m,n,m,n)); 5215 PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE)); 5216 c = (Mat_SeqAIJ*)(*C)->data; 5217 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 5218 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 5219 Ccsr = new CsrMatrix; 5220 Cmat->cprowIndices = NULL; 5221 c->compressedrow.use = PETSC_FALSE; 5222 c->compressedrow.nrows = 0; 5223 c->compressedrow.i = NULL; 5224 c->compressedrow.rindex = NULL; 5225 Ccusp->workVector = NULL; 5226 Ccusp->nrows = m; 5227 Ccusp->mat = Cmat; 5228 Ccusp->mat->mat = Ccsr; 5229 Ccsr->num_rows = m; 5230 Ccsr->num_cols = n; 5231 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 5232 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 5233 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 5234 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 5235 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 5236 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 5237 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5238 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5239 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5240 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 5241 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 5242 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5243 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5244 5245 Acsr = (CsrMatrix*)Acusp->mat->mat; 5246 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 5247 Annz = (PetscInt)Acsr->column_indices->size(); 5248 Bnnz = (PetscInt)Bcsr->column_indices->size(); 5249 c->nz = Annz + Bnnz; 5250 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 5251 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 5252 Ccsr->values = new THRUSTARRAY(c->nz); 5253 Ccsr->num_entries = c->nz; 5254 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 5255 if (c->nz) { 5256 auto Acoo = new THRUSTINTARRAY32(Annz); 5257 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 5258 auto Ccoo = new THRUSTINTARRAY32(c->nz); 5259 THRUSTINTARRAY32 *Aroff,*Broff; 5260 5261 if (a->compressedrow.use) { /* need full row offset */ 5262 if (!Acusp->rowoffsets_gpu) { 5263 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 5264 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 5265 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 5266 } 5267 Aroff = Acusp->rowoffsets_gpu; 5268 } else Aroff = Acsr->row_offsets; 5269 if (b->compressedrow.use) { /* need full row offset */ 5270 if (!Bcusp->rowoffsets_gpu) { 5271 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 5272 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 5273 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 5274 } 5275 Broff = Bcusp->rowoffsets_gpu; 5276 } else Broff = Bcsr->row_offsets; 5277 PetscCall(PetscLogGpuTimeBegin()); 5278 stat = cusparseXcsr2coo(Acusp->handle, 5279 Aroff->data().get(), 5280 Annz, 5281 m, 5282 Acoo->data().get(), 5283 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 5284 stat = cusparseXcsr2coo(Bcusp->handle, 5285 Broff->data().get(), 5286 Bnnz, 5287 m, 5288 Bcoo->data().get(), 5289 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 5290 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 5291 auto Aperm = thrust::make_constant_iterator(1); 5292 auto Bperm = thrust::make_constant_iterator(0); 5293 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 5294 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 5295 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 5296 #else 5297 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 5298 auto Bcib = Bcsr->column_indices->begin(); 5299 auto Bcie = Bcsr->column_indices->end(); 5300 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 5301 #endif 5302 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 5303 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 5304 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 5305 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 5306 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 5307 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 5308 auto p1 = Ccusp->cooPerm->begin(); 5309 auto p2 = Ccusp->cooPerm->begin(); 5310 thrust::advance(p2,Annz); 5311 PetscCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 5312 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 5313 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 5314 #endif 5315 auto cci = thrust::make_counting_iterator(zero); 5316 auto cce = thrust::make_counting_iterator(c->nz); 5317 #if 0 //Errors on SUMMIT cuda 11.1.0 5318 PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 5319 #else 5320 auto pred = thrust::identity<int>(); 5321 PetscCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 5322 PetscCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 5323 #endif 5324 stat = cusparseXcoo2csr(Ccusp->handle, 5325 Ccoo->data().get(), 5326 c->nz, 5327 m, 5328 Ccsr->row_offsets->data().get(), 5329 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 5330 PetscCall(PetscLogGpuTimeEnd()); 5331 delete wPerm; 5332 delete Acoo; 5333 delete Bcoo; 5334 delete Ccoo; 5335 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 5336 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 5337 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 5338 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 5339 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 5340 #endif 5341 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 5342 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 5343 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 5344 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5345 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 5346 CsrMatrix *CcsrT = new CsrMatrix; 5347 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 5348 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 5349 5350 (*C)->form_explicit_transpose = PETSC_TRUE; 5351 (*C)->transupdated = PETSC_TRUE; 5352 Ccusp->rowoffsets_gpu = NULL; 5353 CmatT->cprowIndices = NULL; 5354 CmatT->mat = CcsrT; 5355 CcsrT->num_rows = n; 5356 CcsrT->num_cols = m; 5357 CcsrT->num_entries = c->nz; 5358 5359 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 5360 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 5361 CcsrT->values = new THRUSTARRAY(c->nz); 5362 5363 PetscCall(PetscLogGpuTimeBegin()); 5364 auto rT = CcsrT->row_offsets->begin(); 5365 if (AT) { 5366 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 5367 thrust::advance(rT,-1); 5368 } 5369 if (BT) { 5370 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 5371 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 5372 thrust::copy(titb,tite,rT); 5373 } 5374 auto cT = CcsrT->column_indices->begin(); 5375 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 5376 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 5377 auto vT = CcsrT->values->begin(); 5378 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 5379 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 5380 PetscCall(PetscLogGpuTimeEnd()); 5381 5382 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 5383 PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 5384 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 5385 PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar))); 5386 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 5387 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 5388 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5389 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5390 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5391 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 5392 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 5393 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 5394 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 5395 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 5396 #endif 5397 Ccusp->matTranspose = CmatT; 5398 } 5399 } 5400 5401 c->singlemalloc = PETSC_FALSE; 5402 c->free_a = PETSC_TRUE; 5403 c->free_ij = PETSC_TRUE; 5404 PetscCall(PetscMalloc1(m+1,&c->i)); 5405 PetscCall(PetscMalloc1(c->nz,&c->j)); 5406 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 5407 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 5408 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 5409 ii = *Ccsr->row_offsets; 5410 jj = *Ccsr->column_indices; 5411 PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5412 PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5413 } else { 5414 PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5415 PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5416 } 5417 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 5418 PetscCall(PetscMalloc1(m,&c->ilen)); 5419 PetscCall(PetscMalloc1(m,&c->imax)); 5420 c->maxnz = c->nz; 5421 c->nonzerorowcnt = 0; 5422 c->rmax = 0; 5423 for (i = 0; i < m; i++) { 5424 const PetscInt nn = c->i[i+1] - c->i[i]; 5425 c->ilen[i] = c->imax[i] = nn; 5426 c->nonzerorowcnt += (PetscInt)!!nn; 5427 c->rmax = PetscMax(c->rmax,nn); 5428 } 5429 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 5430 PetscCall(PetscMalloc1(c->nz,&c->a)); 5431 (*C)->nonzerostate++; 5432 PetscCall(PetscLayoutSetUp((*C)->rmap)); 5433 PetscCall(PetscLayoutSetUp((*C)->cmap)); 5434 Ccusp->nonzerostate = (*C)->nonzerostate; 5435 (*C)->preallocated = PETSC_TRUE; 5436 } else { 5437 PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 5438 c = (Mat_SeqAIJ*)(*C)->data; 5439 if (c->nz) { 5440 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 5441 PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 5442 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5443 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 5444 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 5445 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 5446 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5447 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5448 Acsr = (CsrMatrix*)Acusp->mat->mat; 5449 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 5450 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 5451 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 5452 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 5453 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 5454 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 5455 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 5456 auto pmid = Ccusp->cooPerm->begin(); 5457 thrust::advance(pmid,Acsr->num_entries); 5458 PetscCall(PetscLogGpuTimeBegin()); 5459 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 5460 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 5461 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 5462 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 5463 thrust::for_each(zibait,zieait,VecCUDAEquals()); 5464 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 5465 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 5466 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 5467 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 5468 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 5469 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 5470 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 5471 PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 5472 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5473 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 5474 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 5475 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 5476 auto vT = CcsrT->values->begin(); 5477 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 5478 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 5479 (*C)->transupdated = PETSC_TRUE; 5480 } 5481 PetscCall(PetscLogGpuTimeEnd()); 5482 } 5483 } 5484 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 5485 (*C)->assembled = PETSC_TRUE; 5486 (*C)->was_assembled = PETSC_FALSE; 5487 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 5488 PetscFunctionReturn(0); 5489 } 5490 5491 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 5492 { 5493 bool dmem; 5494 const PetscScalar *av; 5495 5496 PetscFunctionBegin; 5497 dmem = isCudaMem(v); 5498 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av)); 5499 if (n && idx) { 5500 THRUSTINTARRAY widx(n); 5501 widx.assign(idx,idx+n); 5502 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 5503 5504 THRUSTARRAY *w = NULL; 5505 thrust::device_ptr<PetscScalar> dv; 5506 if (dmem) { 5507 dv = thrust::device_pointer_cast(v); 5508 } else { 5509 w = new THRUSTARRAY(n); 5510 dv = w->data(); 5511 } 5512 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 5513 5514 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 5515 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 5516 thrust::for_each(zibit,zieit,VecCUDAEquals()); 5517 if (w) { 5518 PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 5519 } 5520 delete w; 5521 } else { 5522 PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 5523 } 5524 if (!dmem) PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 5525 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 5526 PetscFunctionReturn(0); 5527 } 5528