/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
#define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
#include <thrust/async/for_each.h>
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Enum-name table in PetscOptionsEnum() layout: value names, then the enum type name, then the common prefix, then a terminating 0 */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0!
                                                               We created one*/,
                                                   "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

/* forward declarations of the Mat operations implemented later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
/* C++ overloads: one destroys a triangular-factor struct, the other (declared below) a mult struct */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Records the requested storage format in the matrix's GPU-side struct; reached through the
   "MatCUSPARSESetFormat_C" composed method installed for this type (see MatCUSPARSESetFormat()) */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
   `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

   Level: intermediate

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}

/* Records the CPU-solve preference in the matrix's GPU-side struct; reached through the
   "MatCUSPARSESetUseCPUSolve_C" composed method (see MatCUSPARSESetUseCPUSolve()) */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Note:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}

/* Intercepts MAT_FORM_EXPLICIT_TRANSPOSE (which affects the cached GPU transpose); all other options
   are forwarded to the host-side MatSetOption_SeqAIJ() */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: the factorization itself runs on the CPU (MatLUFactorNumeric_SeqAIJ);
   afterwards the solve function pointers are switched to the GPU kernels unless CPU solve was requested */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col;
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the fresh factors live on the host */
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      /* no permutations needed: use the cheaper natural-ordering solves */
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/* Processes the -mat_cusparse_* command-line options (storage format, CPU solve, and — for
   CUDA >= 11 — the SpMV/SpMM/csr2csc algorithm choices) */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* Builds (or, on refactorization, just refreshes the values of) the GPU copy of the unit
   lower-triangular ILU factor L from the host-side factored SeqAIJ data */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (n unit-diagonal entries plus the strict-lower entries of rows 1..n-1, as consumed by the fill loop below) */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        /* first build: assemble L in pinned host buffers, then upload */
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0; /* row 0 of L holds only its unit diagonal */
        v       = aa;
        vi      = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          /* append the explicit unit diagonal at the end of the row */
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo; /* keep the pinned value buffer for later value-only updates */
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds (or refreshes the values of) the GPU copy of the upper-triangular ILU factor U
   from the host-side factored SeqAIJ data */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct
*upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt  *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar *aa = a->a, *v;
  PetscInt        *AiUp, *AjUp;
  PetscInt         i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        /* first build: assemble U in pinned host buffers, then upload */
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix: rows are walked backwards since the host factor
           stores row i of U between adiag[i+1]+1 and adiag[i] (per the pointer setup below) */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          /* NOTE(review): v[nz] is the entry stored at adiag[i]; taking 1./v[nz] assumes the host
             factorization stores the inverted diagonal there — confirm against MatLUFactorNumeric_SeqAIJ */
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp; /* keep the pinned value buffer for later value-only updates */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1.
/ v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Uploads both ILU triangular factors to the GPU (building them on first use), allocates the
   intermediate work vector for the two-stage solve, and caches the row/column permutation
   indices used by the non-natural-ordering MatSolve variants */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n *
sizeof(PetscInt))); 565 } 566 PetscFunctionReturn(0); 567 } 568 569 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 570 { 571 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 572 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 573 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 574 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 575 PetscInt *AiUp, *AjUp; 576 PetscScalar *AAUp; 577 PetscScalar *AALo; 578 PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j; 579 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 580 const PetscInt *ai = b->i, *aj = b->j, *vj; 581 const MatScalar *aa = b->a, *v; 582 583 PetscFunctionBegin; 584 if (!n) PetscFunctionReturn(0); 585 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 586 try { 587 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 588 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 589 if (!upTriFactor && !loTriFactor) { 590 /* Allocate Space for the upper triangular matrix */ 591 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 592 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 593 594 /* Fill the upper triangular matrix */ 595 AiUp[0] = (PetscInt)0; 596 AiUp[n] = nzUpper; 597 offset = 0; 598 for (i = 0; i < n; i++) { 599 /* set the pointers */ 600 v = aa + ai[i]; 601 vj = aj + ai[i]; 602 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 603 604 /* first, set the diagonal elements */ 605 AjUp[offset] = (PetscInt)i; 606 AAUp[offset] = (MatScalar)1.0 / v[nz]; 607 AiUp[i] = offset; 608 AALo[offset] = (MatScalar)1.0 / v[nz]; 609 610 offset += 1; 611 if (nz > 0) { 612 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 613 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 614 for (j = 
offset; j < offset + nz; j++) { 615 AAUp[j] = -AAUp[j]; 616 AALo[j] = AAUp[j] / v[nz]; 617 } 618 offset += nz; 619 } 620 } 621 622 /* allocate space for the triangular factor information */ 623 PetscCall(PetscNew(&upTriFactor)); 624 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 625 626 /* Create the matrix description */ 627 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 628 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 629 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 630 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 631 #else 632 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 633 #endif 634 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 635 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 636 637 /* set the matrix */ 638 upTriFactor->csrMat = new CsrMatrix; 639 upTriFactor->csrMat->num_rows = A->rmap->n; 640 upTriFactor->csrMat->num_cols = A->cmap->n; 641 upTriFactor->csrMat->num_entries = a->nz; 642 643 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 644 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 645 646 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 647 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 648 649 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 650 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 651 652 /* set the operation */ 653 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 654 655 /* Create the solve analysis information */ 656 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 657 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 658 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 659 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, 
upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 660 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 661 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 662 #endif 663 664 /* perform the solve analysis */ 665 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 666 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 667 668 PetscCallCUDA(WaitForCUDA()); 669 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 670 671 /* assign the pointer */ 672 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 673 674 /* allocate space for the triangular factor information */ 675 PetscCall(PetscNew(&loTriFactor)); 676 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 677 678 /* Create the matrix description */ 679 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 680 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 681 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 682 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 683 #else 684 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 685 #endif 686 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 687 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 688 689 /* set the operation */ 690 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 691 692 /* set 
the matrix */ 693 loTriFactor->csrMat = new CsrMatrix; 694 loTriFactor->csrMat->num_rows = A->rmap->n; 695 loTriFactor->csrMat->num_cols = A->cmap->n; 696 loTriFactor->csrMat->num_entries = a->nz; 697 698 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 699 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 700 701 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 702 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 703 704 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 705 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 706 707 /* Create the solve analysis information */ 708 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 709 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 710 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 711 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 712 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 713 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 714 #endif 715 716 /* perform the solve analysis */ 717 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 718 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 719 720 PetscCallCUDA(WaitForCUDA()); 721 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 722 723 /* assign the pointer */ 724 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = 
loTriFactor;

      /* log the host-to-device uploads done above: row offsets + column indices (int) and values (PetscScalar), for both factors */
      PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
      PetscCallCUDA(cudaFreeHost(AiUp));
      PetscCallCUDA(cudaFreeHost(AjUp));
    } else {
      /* Factor structures already live on the GPU: recompute only the numerical values on the host
         and re-upload them into the existing device CSR arrays.
         Fill the upper triangular matrix */
      offset = 0;
      for (i = 0; i < n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements; v[nz] is the last stored entry of the row
           (the diagonal, per the "exclude diag[i]" convention above — NOTE(review): assumes diagonal stored last) */
        AAUp[offset] = 1.0 / v[nz];
        AALo[offset] = 1.0 / v[nz];

        offset += 1;
        if (nz > 0) {
          PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
          for (j = offset; j < offset + nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j] / v[nz]; /* lower factor values are the scaled upper factor values */
          }
          offset += nz;
        }
      }
      PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
      PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
      /* upload just the refreshed values; structure (offsets/indices) is unchanged */
      upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
      PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
    }
    PetscCallCUDA(cudaFreeHost(AAUp));
    PetscCallCUDA(cudaFreeHost(AALo));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
}
PetscFunctionReturn(0);
}

/* MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC device triangular factors for A
   and, when the ordering is not the identity, uploads the row/column permutations used by
   MatSolve_SeqAIJCUSPARSE to reorder the right-hand side and solution. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  /* scratch vector shared by the triangular solves */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* off-diagonals are stored once but belong to both factors; diagonal counted once */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    /* rperm = ordering, cperm = its inverse; both uploaded to the device for use in the solves */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* MatCholeskyFactorNumeric_SeqAIJCUSPARSE - performs the numeric Cholesky/ICC factorization on
   the CPU, then mirrors the factors onto the GPU and installs the CUSPARSE solve routines. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
*/
  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/* MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds explicit transposes (as CSC, via csr2csc)
   of both triangular factors and runs the csrsv solve analysis on them, so that
   MatSolveTranspose can use NON_TRANSPOSE solves on the transposed data. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; the fill mode flips under transposition */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor (dimensions swapped) */
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  /* BUGFIX: this was a second PetscLogEventBegin, leaving the GenerateTranspose event unbalanced */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
#else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  /* BUGFIX: this was a second PetscLogEventBegin, leaving the GenerateTranspose event unbalanced */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  /* TODO: the four nearly identical descriptor/csr2csc/csrsv-analysis sequences in this function
     could be factored into a shared helper */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* functor: truncate a PetscScalar's real part to a PetscInt (used to read back csr2csc's permutation) */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

/* MatSeqAIJCUSPARSEFormExplicitTranspose - ensures an up-to-date explicit transpose of A is
   cached on the GPU in cusparsestruct->matTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct =
(Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* nothing to do when a current transpose is already cached */
  if (A->transupdated) PetscFunctionReturn(0);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-resident scalars used by SpMV with the transpose) */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose is a new CSR with row/column dimensions swapped relative to A */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* cache A's row offsets on the device for the csr2csc below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT dims mirror A's (rmap x cmap) rather than being swapped — confirm this is intended for rectangular A */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build (once) the CSR->CSC value permutation: run csr2csc on the sequence 0,1,2,...
         so that each transposed "value" records the source position, then truncate to ints. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* refresh the transpose's values by gathering A's values through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U (using the explicitly transposed factor, so the op is NON_TRANSPOSE) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose solve for factors produced with
   the identity ordering: same as above but without the permutation copies. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* MatSolve_SeqAIJCUSPARSE - forward/backward triangular solve with a (possibly permuted) factored matrix */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
thrust::device_ptr<PetscScalar> xGPU;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU[k] = bb[rperm[k]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: tempGPU -> xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: xarray -> tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: xx[k] = tempGPU[cperm[k]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); /* one multiply + one add per off-diagonal factor entry */
  PetscFunctionReturn(0);
}

/* Solve A x = b with the cached triangular factors when the factorization was done in
   natural ordering: no permutation of b or x is needed, so bb/xx are used directly.
   L solve goes bb -> tempGPU, U solve goes tempGPU -> xx. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: bb -> tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U: tempGPU -> xx */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

#if CUSPARSE_VERSION >= 11500
/* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */

/* Solve with the ILU(0) factors produced by MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0, using the
   generic SpSV API. The descriptors dnVecDescr_X/dnVecDescr_Y are reused: their data pointers
   are repointed with cusparseDnVecSetValues() before each triangular solve.
   The body continues past the first (L) solve below. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  /* Solve U*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); /* one multiply + one add per off-diagonal factor entry */
  PetscFunctionReturn(0);
}

/* Solve A^T x = b with the ILU(0) factors via SpSV transpose solves.
   Since A^T = (L U)^T = U^T L^T, the U^T solve runs first (b -> Y), then the L^T solve (Y -> x).
   The transpose SpSV descriptors/buffers are created lazily on the first call, and the
   (numeric) transpose analysis is redone whenever the factor values change (flag is reset by
   MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* SpSV analysis is numeric; redo it if the factor values changed since the last transpose solve */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Numeric ILU(0) factorization: copies A's (device) values into fact's in-place factor storage,
   runs cusparseXcsrilu02(), then performs the (numeric) SpSV analyses for the L and U solves.
   The symbolic phase (MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0) must have set up all descriptors
   and buffers. The body continues past the debug zero-pivot check below. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status =
cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* Symbolic ILU(0) factorization on the GPU.

   ILU(0) has no fill, so the factor shares A's sparsity pattern: this routine copies A's i/j
   arrays to device storage owned by fact, creates the cuSPARSE descriptors for M (the combined
   in-place L/U storage), L (unit lower) and U (non-unit upper), queries and allocates all work
   buffers, runs the structural csrilu02 analysis, and estimates the FLOPs of the numeric phase.
   isrow/iscol are unused here (pattern is taken as-is); info->fill is only recorded.

   Sets fact->ops->lufactornumeric so the numeric phase runs MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0): no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
   */
  /* L and U are views of the same in-place factor storage (csrRowPtr/csrColIdx/csrVal);
     fill mode + diag type select which triangle each descriptor exposes */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft; /* shadows the device const int *Ai above; these are host arrays */
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow = Ai[i + 1] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
           nzLeft is approximated as half the row; the exact sub-diagonal count would be Adiag[i] - Ai[i]
           (the original code computed that value but immediately overwrote it — dead store removed).
           NOTE(review): confirm the (nzRow - 1) / 2 approximation is intended over the exact count.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}

/* Solve A x = b with the ICC(0) factor (A = L L^T): forward solve with L (b -> Y), then
   transpose solve with the same L descriptor (Y -> x). The body continues past the first
   (L) solve below. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE,
fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); /* one multiply + one add per off-diagonal factor entry */
  PetscFunctionReturn(0);
}

/* Numeric ICC(0) factorization: copies A's (device) values into fact's in-place factor
   storage, runs cusparseXcsric02() on the lower triangle, then performs the (numeric)
   SpSV analyses for the L and L^T solves. Because the factor is symmetric (A = L L^T),
   solve and solvetranspose are the same routine. The symbolic phase must have set up all
   descriptors and buffers. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* SpSV analysis is numeric, so it must run after csric02() has produced the factor values */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
   */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* symmetric factor: transpose solve == solve */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - symbolic phase of the device ICC(0) factorization.

   Since ICC(0) keeps A's sparsity pattern, the "symbolic" work here is really setup:
   copy A's CSR structure to the factor, create the cuSPARSE descriptors for M (the matrix
   to be factorized in place) and L (its lower-triangular view), size and allocate the
   csric02/SpSV work buffers, run the csric02 analysis, and estimate the numeric-phase FLOPs.

   Input Parameters:
+  fact - the factor matrix (MATSEQAIJCUSPARSE)
.  A    - the matrix to factor; must be square with a full diagonal
.  perm - row/column permutation (must be identity for this fast path; checked by the caller)
-  info - factorization options (only info->fill is recorded)

   Note: only compiled when the enclosing CUDA/cuSPARSE version guard (opened earlier in
   this file, closed by the #endif after this function) is satisfied.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
  */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact.                               */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  /* cuSPARSE csric02/SpSV use 32-bit indices, hence the int (not PetscInt) arrays */
  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
     The larger of the two SpSV buffers is merged with the csric02 buffer; the smaller one
     gets its own allocation.
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Aseqi, nzRow, nzLeft; /* renamed from Ai to avoid shadowing the const int *Ai above */
    PetscLogDouble flops = 0.0;

    Aseqi = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Aseqi[i + 1] - Aseqi[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
#endif

/* MatILUFactorSymbolic_SeqAIJCUSPARSE - ILU symbolic dispatch.

   With a new-enough cuSPARSE, zero fill levels, identity permutations and device-side
   factorization requested, take the fast ILU(0) device path; otherwise fall back to the
   host SeqAIJ symbolic routine and hook up the device numeric factorization.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}

/* MatLUFactorSymbolic_SeqAIJCUSPARSE - full LU symbolic: host symbolic phase, device numeric phase */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* MatICCFactorSymbolic_SeqAIJCUSPARSE - ICC symbolic dispatch.

   Mirrors the ILU dispatch above: zero levels + identity permutation + device
   factorization selects the ICC(0) fast path; otherwise use the host symbolic routine.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}

/* MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Cholesky symbolic: host symbolic phase, device numeric phase */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* MatFactorGetSolverType_seqaij_cusparse - report the solver package name for this factor */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* MatGetFactor_seqaijcusparse_cusparse - create an (empty) factor matrix of type MATSEQAIJCUSPARSE
   for the requested factorization type, wiring up the symbolic-factorization function pointers.

   The option -mat_factor_bind_factorization {host|device} selects where the factorization runs
   (default "device"); the choice is stored in the tri-factor structure.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* matrix is bound to the CPU: use the plain SeqAIJ host routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* MatSeqAIJCUSPARSECopyFromGPU - copy the matrix values from device to host when the
   up-to-date copy lives only on the GPU.

   Handles both unfactored matrices (values taken from the CSR mat structure) and, for
   new-enough cuSPARSE, factorized matrices whose values live in fs->csrVal. On success
   the offload mask becomes PETSC_OFFLOAD_BOTH. Only values are copied; the sparsity
   pattern on the host is assumed current.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2042 { 2043 PetscFunctionBegin; 2044 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2045 *array = ((Mat_SeqAIJ *)A->data)->a; 2046 PetscFunctionReturn(0); 2047 } 2048 2049 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2050 { 2051 PetscFunctionBegin; 2052 A->offloadmask = PETSC_OFFLOAD_CPU; 2053 *array = NULL; 2054 PetscFunctionReturn(0); 2055 } 2056 2057 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) 2058 { 2059 PetscFunctionBegin; 2060 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2061 *array = ((Mat_SeqAIJ *)A->data)->a; 2062 PetscFunctionReturn(0); 2063 } 2064 2065 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) 2066 { 2067 PetscFunctionBegin; 2068 *array = NULL; 2069 PetscFunctionReturn(0); 2070 } 2071 2072 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2073 { 2074 PetscFunctionBegin; 2075 *array = ((Mat_SeqAIJ *)A->data)->a; 2076 PetscFunctionReturn(0); 2077 } 2078 2079 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2080 { 2081 PetscFunctionBegin; 2082 A->offloadmask = PETSC_OFFLOAD_CPU; 2083 *array = NULL; 2084 PetscFunctionReturn(0); 2085 } 2086 2087 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 2088 { 2089 Mat_SeqAIJCUSPARSE *cusp; 2090 CsrMatrix *matrix; 2091 2092 PetscFunctionBegin; 2093 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2094 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 2095 cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2096 PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL"); 2097 matrix = (CsrMatrix 
*)cusp->mat->mat; 2098 2099 if (i) { 2100 #if !defined(PETSC_USE_64BIT_INDICES) 2101 *i = matrix->row_offsets->data().get(); 2102 #else 2103 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices"); 2104 #endif 2105 } 2106 if (j) { 2107 #if !defined(PETSC_USE_64BIT_INDICES) 2108 *j = matrix->column_indices->data().get(); 2109 #else 2110 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices"); 2111 #endif 2112 } 2113 if (a) *a = matrix->values->data().get(); 2114 if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 2115 PetscFunctionReturn(0); 2116 } 2117 2118 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 2119 { 2120 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 2121 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 2122 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2123 PetscInt m = A->rmap->n, *ii, *ridx, tmp; 2124 cusparseStatus_t stat; 2125 PetscBool both = PETSC_TRUE; 2126 2127 PetscFunctionBegin; 2128 PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU"); 2129 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2130 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2131 CsrMatrix *matrix; 2132 matrix = (CsrMatrix *)cusparsestruct->mat->mat; 2133 2134 PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 2135 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2136 matrix->values->assign(a->a, a->a + a->nz); 2137 PetscCallCUDA(WaitForCUDA()); 2138 PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar))); 2139 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2140 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 2141 } else { 2142 PetscInt nnz; 2143 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2144 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 2145 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 2146 delete cusparsestruct->workVector; 2147 delete cusparsestruct->rowoffsets_gpu; 2148 cusparsestruct->workVector = NULL; 2149 cusparsestruct->rowoffsets_gpu = NULL; 2150 try { 2151 if (a->compressedrow.use) { 2152 m = a->compressedrow.nrows; 2153 ii = a->compressedrow.i; 2154 ridx = a->compressedrow.rindex; 2155 } else { 2156 m = A->rmap->n; 2157 ii = a->i; 2158 ridx = NULL; 2159 } 2160 PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 2161 if (!a->a) { 2162 nnz = ii[m]; 2163 both = PETSC_FALSE; 2164 } else nnz = a->nz; 2165 PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 2166 2167 /* create cusparse matrix */ 2168 cusparsestruct->nrows = m; 2169 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 2170 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 2171 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 2172 PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2173 2174 PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar))); 2175 PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar))); 2176 PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 2177 PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2178 PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2179 PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2180 PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2181 2182 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2183 if 
(cusparsestruct->format == MAT_CUSPARSE_CSR) { 2184 /* set the matrix */ 2185 CsrMatrix *mat = new CsrMatrix; 2186 mat->num_rows = m; 2187 mat->num_cols = A->cmap->n; 2188 mat->num_entries = nnz; 2189 mat->row_offsets = new THRUSTINTARRAY32(m + 1); 2190 mat->row_offsets->assign(ii, ii + m + 1); 2191 2192 mat->column_indices = new THRUSTINTARRAY32(nnz); 2193 mat->column_indices->assign(a->j, a->j + nnz); 2194 2195 mat->values = new THRUSTARRAY(nnz); 2196 if (a->a) mat->values->assign(a->a, a->a + nnz); 2197 2198 /* assign the pointer */ 2199 matstruct->mat = mat; 2200 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2201 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 2202 stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2203 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2204 PetscCallCUSPARSE(stat); 2205 } 2206 #endif 2207 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2208 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2209 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2210 #else 2211 CsrMatrix *mat = new CsrMatrix; 2212 mat->num_rows = m; 2213 mat->num_cols = A->cmap->n; 2214 mat->num_entries = nnz; 2215 mat->row_offsets = new THRUSTINTARRAY32(m + 1); 2216 mat->row_offsets->assign(ii, ii + m + 1); 2217 2218 mat->column_indices = new THRUSTINTARRAY32(nnz); 2219 mat->column_indices->assign(a->j, a->j + nnz); 2220 2221 mat->values = new THRUSTARRAY(nnz); 2222 if (a->a) mat->values->assign(a->a, a->a + nnz); 2223 2224 cusparseHybMat_t hybMat; 2225 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 2226 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2227 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 2228 PetscCallCUSPARSE(stat); 2229 /* assign the pointer */ 2230 matstruct->mat = hybMat; 2231 2232 if (mat) { 2233 if (mat->values) delete (THRUSTARRAY *)mat->values; 2234 if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2235 if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2236 delete (CsrMatrix *)mat; 2237 } 2238 #endif 2239 } 2240 2241 /* assign the compressed row indices */ 2242 if (a->compressedrow.use) { 2243 cusparsestruct->workVector = new THRUSTARRAY(m); 2244 matstruct->cprowIndices = new THRUSTINTARRAY(m); 2245 matstruct->cprowIndices->assign(ridx, ridx + m); 2246 tmp = m; 2247 } else { 2248 cusparsestruct->workVector = NULL; 2249 matstruct->cprowIndices = NULL; 2250 tmp = 0; 2251 } 2252 PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2253 2254 /* assign the pointer */ 2255 cusparsestruct->mat = matstruct; 2256 } catch (char *ex) { 2257 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 2258 } 2259 PetscCallCUDA(WaitForCUDA()); 2260 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2261 cusparsestruct->nonzerostate = A->nonzerostate; 2262 } 2263 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2264 } 2265 PetscFunctionReturn(0); 2266 } 2267 2268 struct VecCUDAPlusEquals { 2269 template <typename Tuple> 2270 __host__ __device__ void operator()(Tuple t) 2271 { 2272 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2273 } 2274 }; 2275 2276 struct VecCUDAEquals { 2277 template <typename Tuple> 2278 __host__ __device__ void operator()(Tuple t) 2279 { 2280 thrust::get<1>(t) = thrust::get<0>(t); 2281 } 2282 }; 2283 2284 struct 
VecCUDAEqualsReverse { 2285 template <typename Tuple> 2286 __host__ __device__ void operator()(Tuple t) 2287 { 2288 thrust::get<0>(t) = thrust::get<1>(t); 2289 } 2290 }; 2291 2292 struct MatMatCusparse { 2293 PetscBool cisdense; 2294 PetscScalar *Bt; 2295 Mat X; 2296 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2297 PetscLogDouble flops; 2298 CsrMatrix *Bcsr; 2299 2300 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2301 cusparseSpMatDescr_t matSpBDescr; 2302 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2303 cusparseDnMatDescr_t matBDescr; 2304 cusparseDnMatDescr_t matCDescr; 2305 PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2306 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2307 void *dBuffer4; 2308 void *dBuffer5; 2309 #endif 2310 size_t mmBufferSize; 2311 void *mmBuffer; 2312 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2313 cusparseSpGEMMDescr_t spgemmDesc; 2314 #endif 2315 }; 2316 2317 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2318 { 2319 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2320 2321 PetscFunctionBegin; 2322 PetscCallCUDA(cudaFree(mmdata->Bt)); 2323 delete mmdata->Bcsr; 2324 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2325 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2326 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2327 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2328 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2329 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2330 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2331 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2332 #endif 2333 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2334 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2335 #endif 
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);

/*
   Numeric phase of C = op(A)*op(B) with A sparse (SEQAIJCUSPARSE) and B dense.

   Handles MATPRODUCT_AB, AtB, ABt, PtAP and RARt. For PtAP/RARt the sparse-dense
   product is written to the intermediate dense matrix mmdata->X and the final
   result is obtained with a dense-dense multiply (see the end of this routine).
   For CUDA >= 11 the cuSPARSE generic SpMM API is used, with the dense/sparse
   descriptors and the work buffer cached in mmdata across calls; for older CUDA
   the legacy csrmm path is used (which cannot transpose B, so B^T is formed
   explicitly with cublasXgeam into mmdata->Bt).
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select which mult struct (A or its explicit transpose) and which cusparse op to use,
     and the dimensions m x n of op(A)*op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the stored explicit transpose of A with a non-transpose op */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseCUDAGetArrayRead(B, &barray));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* PtAP/RARt write the intermediate product into mmdata->X; other types write into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor for B */
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* only grow the work buffer, never shrink it; cudaFree(NULL) is a no-op on first allocation */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly into mmdata->Bt (allocated in the symbolic phase) */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  /* finish PtAP/RARt with a dense-dense multiply of B (or B^T) against the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* undo the temporary GPU conversions requested by the symbolic/numeric phases */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}

/*
   Symbolic phase of C = op(A)*op(B) with A sparse (SEQAIJCUSPARSE) and B dense.

   Sets the sizes and type of C, and allocates the MatMatCusparse product data
   (intermediate dense matrix X for PtAP/RARt; explicit B^T buffer for the
   pre-CUDA-11 path). The numeric phase is installed at the end.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

static
/* Numeric phase of sparse-sparse C = op(A)*op(B) with both operands SEQAIJCUSPARSE.
   Reuses the cuSPARSE SpGEMM(reuse) descriptors and buffers created in the symbolic
   phase (stored in the MatMatCusparse product data). Transposed operands are handled
   through the explicit transposes formed during the symbolic phase, since the
   cuSPARSE spgemm routines are used with non-transpose ops only. */
PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: only run the assembly bookkeeping below */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* mirror the symmetric-operand shortcuts taken by the symbolic phase */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose; /* explicit transpose formed in the symbolic phase */
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose; /* explicit transpose formed in the symbolic phase */
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* structure was fixed in the symbolic phase; only recompute the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  /* legacy (pre-CUDA-11) csrgemm path */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of sparse-sparse C = op(A)*op(B) with both operands SEQAIJCUSPARSE.
   Determines the nonzero structure of C on the GPU with cuSPARSE
   (SpGEMMreuse for CUDA >= 11.4, SpGEMM for CUDA 11.x, Xcsrgemm before that),
   allocates C's device CSR arrays, and mirrors the structure to the host.
   Symmetric A (for AtB) or B (for ABt) is exploited by computing AB instead. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2733 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2734 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2735 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2736 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2737 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2738 2739 ptype = product->type; 2740 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2741 ptype = MATPRODUCT_AB; 2742 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2743 } 2744 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2745 ptype = MATPRODUCT_AB; 2746 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2747 } 2748 biscompressed = PETSC_FALSE; 2749 ciscompressed = PETSC_FALSE; 2750 switch (ptype) { 2751 case MATPRODUCT_AB: 2752 m = A->rmap->n; 2753 n = B->cmap->n; 2754 k = A->cmap->n; 2755 Amat = Acusp->mat; 2756 Bmat = Bcusp->mat; 2757 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2758 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2759 break; 2760 case MATPRODUCT_AtB: 2761 m = A->cmap->n; 2762 n = B->cmap->n; 2763 k = A->rmap->n; 2764 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2765 Amat = Acusp->matTranspose; 2766 Bmat = Bcusp->mat; 2767 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2768 break; 2769 case MATPRODUCT_ABt: 2770 m = A->rmap->n; 2771 n = B->rmap->n; 2772 k = A->cmap->n; 2773 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2774 Amat = Acusp->mat; 2775 Bmat = Bcusp->matTranspose; 2776 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2777 break; 2778 default: 2779 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2780 } 2781 2782 /* create cusparse matrix */ 2783 PetscCall(MatSetSizes(C, m, n, m, n)); 2784 
PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 2785 c = (Mat_SeqAIJ *)C->data; 2786 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2787 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2788 Ccsr = new CsrMatrix; 2789 2790 c->compressedrow.use = ciscompressed; 2791 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2792 c->compressedrow.nrows = a->compressedrow.nrows; 2793 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 2794 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 2795 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2796 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2797 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 2798 } else { 2799 c->compressedrow.nrows = 0; 2800 c->compressedrow.i = NULL; 2801 c->compressedrow.rindex = NULL; 2802 Ccusp->workVector = NULL; 2803 Cmat->cprowIndices = NULL; 2804 } 2805 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2806 Ccusp->mat = Cmat; 2807 Ccusp->mat->mat = Ccsr; 2808 Ccsr->num_rows = Ccusp->nrows; 2809 Ccsr->num_cols = n; 2810 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 2811 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 2812 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 2813 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2814 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 2815 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 2816 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 2817 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2818 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2819 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2820 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2821 thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0); 2822 c->nz = 0; 2823 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2824 Ccsr->values = new THRUSTARRAY(c->nz); 2825 goto finalizesym; 2826 } 2827 2828 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2829 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2830 Acsr = (CsrMatrix *)Amat->mat; 2831 if (!biscompressed) { 2832 Bcsr = (CsrMatrix *)Bmat->mat; 2833 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2834 BmatSpDescr = Bmat->matDescr; 2835 #endif 2836 } else { /* we need to use row offsets for the full matrix */ 2837 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 2838 Bcsr = new CsrMatrix; 2839 Bcsr->num_rows = B->rmap->n; 2840 Bcsr->num_cols = cBcsr->num_cols; 2841 Bcsr->num_entries = cBcsr->num_entries; 2842 Bcsr->column_indices = cBcsr->column_indices; 2843 Bcsr->values = cBcsr->values; 2844 if (!Bcusp->rowoffsets_gpu) { 2845 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2846 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 2847 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 2848 } 2849 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2850 mmdata->Bcsr = Bcsr; 2851 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2852 if (Bcsr->num_rows && Bcsr->num_cols) { 2853 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2854 PetscCallCUSPARSE(stat); 2855 } 2856 BmatSpDescr = mmdata->matSpBDescr; 2857 #endif 2858 } 2859 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2860 PetscCheck(Bcsr, 
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2861 /* precompute flops count */ 2862 if (ptype == MATPRODUCT_AB) { 2863 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2864 const PetscInt st = a->i[i]; 2865 const PetscInt en = a->i[i + 1]; 2866 for (j = st; j < en; j++) { 2867 const PetscInt brow = a->j[j]; 2868 flops += 2. * (b->i[brow + 1] - b->i[brow]); 2869 } 2870 } 2871 } else if (ptype == MATPRODUCT_AtB) { 2872 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2873 const PetscInt anzi = a->i[i + 1] - a->i[i]; 2874 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 2875 flops += (2. * anzi) * bnzi; 2876 } 2877 } else { /* TODO */ 2878 flops = 0.; 2879 } 2880 2881 mmdata->flops = flops; 2882 PetscCall(PetscLogGpuTimeBegin()); 2883 2884 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2885 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2886 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2887 PetscCallCUSPARSE(stat); 2888 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2889 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2890 { 2891 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2892 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2893 */ 2894 void *dBuffer1 = NULL; 2895 void *dBuffer2 = NULL; 2896 void *dBuffer3 = NULL; 2897 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2898 size_t bufferSize1 = 0; 2899 size_t bufferSize2 = 0; 2900 size_t bufferSize3 = 0; 2901 size_t bufferSize4 = 0; 2902 size_t bufferSize5 = 0; 2903 2904 /*----------------------------------------------------------------------*/ 2905 /* ask bufferSize1 bytes for external memory */ 2906 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 2907 PetscCallCUSPARSE(stat); 2908 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 2909 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2910 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 2911 PetscCallCUSPARSE(stat); 2912 2913 /*----------------------------------------------------------------------*/ 2914 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 2915 PetscCallCUSPARSE(stat); 2916 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 2917 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 2918 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 2919 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 2920 PetscCallCUSPARSE(stat); 2921 
PetscCallCUDA(cudaFree(dBuffer1)); 2922 PetscCallCUDA(cudaFree(dBuffer2)); 2923 2924 /*----------------------------------------------------------------------*/ 2925 /* get matrix C non-zero entries C_nnz1 */ 2926 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2927 c->nz = (PetscInt)C_nnz1; 2928 /* allocate matrix C */ 2929 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2930 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2931 Ccsr->values = new THRUSTARRAY(c->nz); 2932 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2933 /* update matC with the new pointers */ 2934 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 2935 PetscCallCUSPARSE(stat); 2936 2937 /*----------------------------------------------------------------------*/ 2938 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 2939 PetscCallCUSPARSE(stat); 2940 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 2941 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 2942 PetscCallCUSPARSE(stat); 2943 PetscCallCUDA(cudaFree(dBuffer3)); 2944 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2945 PetscCallCUSPARSE(stat); 2946 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 
/ 1024)); 2947 } 2948 #else 2949 size_t bufSize2; 2950 /* ask bufferSize bytes for external memory */ 2951 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 2952 PetscCallCUSPARSE(stat); 2953 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 2954 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2955 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 2956 PetscCallCUSPARSE(stat); 2957 /* ask bufferSize again bytes for external memory */ 2958 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 2959 PetscCallCUSPARSE(stat); 2960 /* The CUSPARSE documentation is not clear, nor the API 2961 We need both buffers to perform the operations properly! 2962 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2963 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2964 is stored in the descriptor! What a messy API... 
*/ 2965 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 2966 /* compute the intermediate product of A * B */ 2967 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 2968 PetscCallCUSPARSE(stat); 2969 /* get matrix C non-zero entries C_nnz1 */ 2970 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2971 c->nz = (PetscInt)C_nnz1; 2972 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 2973 mmdata->mmBufferSize / 1024)); 2974 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2975 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2976 Ccsr->values = new THRUSTARRAY(c->nz); 2977 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2978 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 2979 PetscCallCUSPARSE(stat); 2980 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2981 PetscCallCUSPARSE(stat); 2982 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2983 #else 2984 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 2985 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 2986 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 2987 PetscCallCUSPARSE(stat); 2988 c->nz = cnz; 2989 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2990 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2991 Ccsr->values = new THRUSTARRAY(c->nz); 2992 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2993 2994 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2995 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2996 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2997 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2998 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 2999 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3000 PetscCallCUSPARSE(stat); 3001 #endif 3002 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3003 PetscCall(PetscLogGpuTimeEnd()); 3004 finalizesym: 3005 c->singlemalloc = PETSC_FALSE; 3006 c->free_a = PETSC_TRUE; 3007 c->free_ij = PETSC_TRUE; 3008 PetscCall(PetscMalloc1(m + 1, &c->i)); 3009 PetscCall(PetscMalloc1(c->nz, &c->j)); 3010 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3011 PetscInt *d_i = c->i; 3012 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3013 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3014 ii = *Ccsr->row_offsets; 3015 jj = *Ccsr->column_indices; 3016 
if (ciscompressed) d_i = c->compressedrow.i; 3017 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3018 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3019 } else { 3020 PetscInt *d_i = c->i; 3021 if (ciscompressed) d_i = c->compressedrow.i; 3022 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3023 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3024 } 3025 if (ciscompressed) { /* need to expand host row offsets */ 3026 PetscInt r = 0; 3027 c->i[0] = 0; 3028 for (k = 0; k < c->compressedrow.nrows; k++) { 3029 const PetscInt next = c->compressedrow.rindex[k]; 3030 const PetscInt old = c->compressedrow.i[k]; 3031 for (; r < next; r++) c->i[r + 1] = old; 3032 } 3033 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3034 } 3035 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3036 PetscCall(PetscMalloc1(m, &c->ilen)); 3037 PetscCall(PetscMalloc1(m, &c->imax)); 3038 c->maxnz = c->nz; 3039 c->nonzerorowcnt = 0; 3040 c->rmax = 0; 3041 for (k = 0; k < m; k++) { 3042 const PetscInt nn = c->i[k + 1] - c->i[k]; 3043 c->ilen[k] = c->imax[k] = nn; 3044 c->nonzerorowcnt += (PetscInt) !!nn; 3045 c->rmax = PetscMax(c->rmax, nn); 3046 } 3047 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3048 PetscCall(PetscMalloc1(c->nz, &c->a)); 3049 Ccsr->num_entries = c->nz; 3050 3051 C->nonzerostate++; 3052 PetscCall(PetscLayoutSetUp(C->rmap)); 3053 PetscCall(PetscLayoutSetUp(C->cmap)); 3054 Ccusp->nonzerostate = C->nonzerostate; 3055 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3056 C->preallocated = PETSC_TRUE; 3057 C->assembled = PETSC_FALSE; 3058 C->was_assembled = PETSC_FALSE; 3059 if 
(product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* Select the symbolic-product implementation for a SeqAIJCUSPARSE A; handles sparse or dense B.
   The GPU backend is used only when neither operand is bound to the CPU and the user has not
   requested the CPU backend via the per-product -*_backend_cpu options queried below. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* The option name (and the help strings) depend on whether the user went through the
       old-style API (MatMatMult() etc, product->api_user) or MatProduct{Create,...}() */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* force CPU dispatch below */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* expressed as a sequence of pairwise products handled by the basic driver */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* y = A x, implemented via the shared add-kernel with yy = NULL (no addition) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* z = A x + y */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y = A^H x (conjugate transpose; trans = herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* z = A^H x + y */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* y = A^T x (plain transpose; trans = PETSC_TRUE, herm = PETSC_FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n): scatter-add a compressed work vector into the full-length
   output. One thread per entry with a bounds guard; launched below with 256-thread blocks on
   PetscDefaultCudaStream. NOTE(review): the flat index is an int while n is PetscInt — assumes
   n fits in int (i.e. fewer than 2^31 nonzero rows); confirm if 64-bit indices ever exceed this. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector: work[k] = x[cprowIndices[k]] via a permutation iterator */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_rows; /* transposed: x has as many entries as A has rows */
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* cuSpMV[] is indexed directly by the operation enum value; guard against ABI drift */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand: dense-vector descriptors and SpMV work buffer are cached per operation */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy, zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                                VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* z = A^T x + y */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* Standard SeqAIJ assembly, then drop the cached device matrix if the nonzero pattern changed
   (it would be stale; it is rebuilt lazily on next use) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscObjectState    onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to `PETSC_COMM_SELF`
.  m - number of rows
.  n - number of columns
.
nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format, also called
   compressed row storage, is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}

/* Free the GPU-side data (plain or triangular-factor variant), detach all composed methods,
   then fall through to the host SeqAIJ destructor */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate on the host as SeqAIJ, then convert the copy in place back to SeqAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}

/* Y = Y + a X, performed on the GPU when both operands live there; falls back to the host
   implementation when the ops tables differ (one matrix bound to CPU) */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz and identical index arrays */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(),
csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* general sparse add via cuSPARSE csrgeam: Y = a*X + 1.0*Y (result pattern equals Y's) */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* a and b are host scalars here, so the handle must temporarily use host pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-for-entry, so a dense cuBLAS axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different pattern: no GPU path, fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y, scaling the nz device values in place with cuBLAS */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zero the values (not the pattern) on both host and, when present, device (including the
   cached explicit transpose); offloadmask records where the zeroed values are valid */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switch the matrix between host (flg = PETSC_TRUE) and device implementations by swapping
   the ops tables and the composed query functions; data is copied back from the GPU first
   when binding to the CPU */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* unbind from CPU: install the CUSPARSE implementations of the matrix operations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are only usable by the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Convert a MATSEQAIJ matrix (or reuse/in-place convert) to MATSEQAIJCUSPARSE:
   sets the default vector type to VECCUDA, allocates the cuSPARSE side structure
   (plain matrix or triangular-factor variant) and installs the device methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}

/* Type constructor: create a MATSEQAIJ then convert it in place to MATSEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

   Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);

/* Register the CUSPARSE (and CUSPARSEBAND) solver types for LU/Cholesky/ILU/ICC */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}

/* Free the COO assembly scaffolding (permutation arrays and extended-COO device maps) */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat
mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Destroy all device-side data of a Mat_SeqAIJCUSPARSE and the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix (its thrust arrays and the object) and NULL out the pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Destroy a triangular-factor struct: descriptor, solve info, CSR data and scratch buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}

/* Destroy a mat-vec struct in the given storage format (CSR here; ELL/HYB only pre-CUDA-11) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* was an unchecked call; propagate the error code */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) { /* one cached SpMV setup per operation flavor */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Release all factorization data held by the tri-factors struct, keeping the struct itself */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}

/* Reset the tri-factors data, destroy its cuSPARSE handle and free the struct */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    handle = (*trifactors)->handle; /* assignment hoisted out of the if-condition for clarity */
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}

/* Lexicographic (row, then column) ordering of zipped (i,j) tuples */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of zipped (i,j) tuples */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 1 where adjacent values differ, 0 where equal (used with adjacent_difference) */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ?
0 : 1; }
};

/* logical OR, used to combine the row-change and column-change indicator arrays */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation was done; just finish assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* host values: stage them on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them up before accumulating */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached transpose stale; if destroy is true also free it and the csr2csc map */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* size changed since the last preallocation: drop cached permutations */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
       n = 6
       coo_i = [3,3,1,4,1,4]
       coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                             /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                     ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                             /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());           /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,...,A->rmap->n) in the ordered array d_i, e.g. d_i = [1,3,3,4,4] with A->rmap->n = 6 */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of the last position in d_i[] where each value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]; a leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point: use the fast 'Basic' device path when the indices
   contain no negative entries, otherwise fall back to the extended host path and
   mirror its jmap/perm arrays on the device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* negative indices (entries to ignore) force the extended path */
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
PetscFunctionReturn(0);
}

/* CUDA kernel: accumulate COO values into the CSR value array.
   Grid-stride loop over the nnz nonzeros; a[i] receives the sum of kv[perm[k]]
   for k in [jmap[i], jmap[i+1]), added to (ADD_VALUES) or replacing (INSERT_VALUES)
   the existing entry. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/* Set values from a COO array: extended path launches MatAddCOOValues on device,
   otherwise defer to the 'Basic' thrust-based implementation */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch kernel launch errors */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.

  Not collective

  Input Parameters:
+ A - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ ia - the CSR row pointers
- ja - the CSR column indices

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with
`MatSeqAIJCUSPARSEGetIJ()`

  Not collective

  Input Parameters:
+ A - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ ia - the CSR row pointers
- ja - the CSR column indices

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may copy host data to the device */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* read-only access: no state increase, just invalidate the caller's pointer */
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
.
A - a `MATSEQAIJCUSPARSE` matrix 4464 4465 Output Parameter: 4466 . a - pointer to the device data 4467 4468 Level: developer 4469 4470 Note: 4471 May trigger host-device copies if up-to-date matrix data is on host 4472 4473 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 4474 @*/ 4475 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) 4476 { 4477 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4478 CsrMatrix *csr; 4479 4480 PetscFunctionBegin; 4481 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4482 PetscValidPointer(a, 2); 4483 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4484 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4485 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4486 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4487 csr = (CsrMatrix *)cusp->mat->mat; 4488 PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4489 *a = csr->values->data().get(); 4490 A->offloadmask = PETSC_OFFLOAD_GPU; 4491 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4492 PetscFunctionReturn(0); 4493 } 4494 /*@C 4495 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()` 4496 4497 Not Collective 4498 4499 Input Parameter: 4500 . A - a `MATSEQAIJCUSPARSE` matrix 4501 4502 Output Parameter: 4503 . 
a - pointer to the device data 4504 4505 Level: developer 4506 4507 .seealso: `MatSeqAIJCUSPARSEGetArray()` 4508 @*/ 4509 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) 4510 { 4511 PetscFunctionBegin; 4512 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4513 PetscValidPointer(a, 2); 4514 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4515 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4516 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4517 *a = NULL; 4518 PetscFunctionReturn(0); 4519 } 4520 4521 /*@C 4522 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 4523 4524 Not Collective 4525 4526 Input Parameter: 4527 . A - a `MATSEQAIJCUSPARSE` matrix 4528 4529 Output Parameter: 4530 . a - pointer to the device data 4531 4532 Level: developer 4533 4534 Note: 4535 Does not trigger host-device copies and flags data validity on the GPU 4536 4537 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 4538 @*/ 4539 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) 4540 { 4541 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4542 CsrMatrix *csr; 4543 4544 PetscFunctionBegin; 4545 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4546 PetscValidPointer(a, 2); 4547 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4548 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4549 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4550 csr = (CsrMatrix *)cusp->mat->mat; 4551 PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4552 *a = csr->values->data().get(); 4553 A->offloadmask = PETSC_OFFLOAD_GPU; 4554 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4555 PetscFunctionReturn(0); 4556 } 4557 4558 /*@C 4559 MatSeqAIJCUSPARSERestoreArrayWrite - 
restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()` 4560 4561 Not Collective 4562 4563 Input Parameter: 4564 . A - a `MATSEQAIJCUSPARSE` matrix 4565 4566 Output Parameter: 4567 . a - pointer to the device data 4568 4569 Level: developer 4570 4571 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 4572 @*/ 4573 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) 4574 { 4575 PetscFunctionBegin; 4576 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4577 PetscValidPointer(a, 2); 4578 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4579 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4580 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4581 *a = NULL; 4582 PetscFunctionReturn(0); 4583 } 4584 4585 struct IJCompare4 { 4586 __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4587 { 4588 if (t1.get<0>() < t2.get<0>()) return true; 4589 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4590 return false; 4591 } 4592 }; 4593 4594 struct Shift { 4595 int _shift; 4596 4597 Shift(int shift) : _shift(shift) { } 4598 __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4599 }; 4600 4601 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in matlab notation */
/* With MAT_INITIAL_MATRIX the merged matrix C is built directly on the GPU (CSR -> COO, zip-iterator
   merge sorted by (row,col), COO -> CSR) and a permutation (cooPerm) mapping A's and B's entries into C
   is recorded; with MAT_REUSE_MATRIX only the numerical values are scattered into C through cooPerm.
   If both A and B carry an explicit transpose, C's transpose is assembled as well. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscValidPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* C has the rows of A and the columns of A followed by the columns of B */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ *)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    /* C is stored uncompressed (full row offsets) */
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the SpMV/SpMM wrappers */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix *)Acusp->mat->mat;
    Bcsr  = (CsrMatrix *)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm records, for each entry of A then B, its destination slot in C (used by the reuse path) */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* B's columns appear after A's in C, so shift them by A->cmap->n on the fly */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      /* merge the (row, col, value, origin-flag) streams of A and B sorted by (row, col);
         the origin flag (1 for A, 0 for B) lands in wPerm and is used below to split cooPerm */
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift applied above so B is left unmodified */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      /* cooPerm[0..Annz) receives the slots of A's entries in C, cooPerm[Annz..) the slots of B's */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress C's merged COO row indices back into CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        /* C^T is simply A^T stacked on top of B^T (n = A->cmap->n + B->cmap->n rows) */
        CcsrT->num_rows    = n;
        CcsrT->num_cols    = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); /* B^T's first offset overwrites A^T's trailing one */
        }
        if (BT) {
          /* B^T's offsets continue after A^T's a->nz entries */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the (i,j) structure on the host so the SeqAIJ part of C is usable */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* per-row lengths, nonzero-row count and max row length from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: only scatter the values of A and B into C through the recorded cooPerm */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      /* cooPerm[0..Annz) holds A's slots in C, cooPerm[Annz..) holds B's */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* C^T values are A^T's followed by B^T's, so plain concatenating copies suffice */
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Gathers n entries of A's device value array into v, either directly (v[k] = av[idx[k]])
   when idx is provided, or as a straight copy of the first n values when idx is NULL.
   v may be either host or device memory; the destination kind is detected with isCudaMem() */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* destination is host memory: gather into a device scratch buffer first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* NOTE(review): this logs a device->host transfer as CpuToGpu; PetscLogGpuToCpu() looks intended - confirm */
  if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}