/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
  #define PETSC_HAVE_THRUST_ASYNC 1
  // thrust::for_each(thrust::cuda::par.on()) requires C++14
  #include <thrust/async/for_each.h>
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG      = 1,
    CUSPARSE_CSRMV_ALG1     = 2,
    CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
    CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
    CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
    CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
    CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
    CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
    CUSPARSE_SPMM_ALG_DEFAULT = 0,
    CUSPARSE_SPMM_COO_ALG1    = 1,
    CUSPARSE_SPMM_COO_ALG2    = 2,
    CUSPARSE_SPMM_COO_ALG3    = 3,
    CUSPARSE_SPMM_COO_ALG4    = 5,
    CUSPARSE_SPMM_CSR_ALG1    = 4,
    CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
    CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
    CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats.

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)

  Level: intermediate

.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
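/* Usage sketch (editor's example, not part of the original file; n, x, y are assumed to
   be declared elsewhere): switch the SpMV storage of an assembled MATSEQAIJCUSPARSE
   matrix to ELL while factors keep the default CSR. On CUDA 11+ only CSR is supported,
   see the SETERRQ in MatSeqAIJCUSPARSEFormExplicitTranspose() below.

     Mat A;
     PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, n, n, 5, NULL, &A));
     // ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() ...
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));
     PetscCall(MatMult(A, x, y)); // SpMV now uses the ELL representation
*/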
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  Use this method to specify whether the solve is done on the CPU or the GPU (GPU is the default).

.seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
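/* Usage sketch (editor's example, not part of the original file): opt in to the CPU
   triangular solves described in the note above, either programmatically

     PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));

   or via the option parsed in MatSetFromOptions_SeqAIJCUSPARSE() below:

     -mat_cusparse_use_cpu_solve

   With the flag set, MatLUFactorNumeric_SeqAIJCUSPARSE() below leaves B->ops->solve at
   the CPU implementation instead of installing the MatSolve_SeqAIJCUSPARSE variants.
*/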
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col;
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
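/* For reference (editor's example, hypothetical invocation): the options registered
   above can be exercised from the command line, e.g.

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format ell \
           -mat_cusparse_spmv_alg csrmv_alg2 -mat_cusparse_csr2csc_alg alg1

   PetscOptionsEnum() maps each string to its 0-based position in the corresponding
   MatCUSPARSE*Algorithms[] list, which is why the PetscCheck()s above assert that the
   cuSPARSE enum values still match those positions.
*/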
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
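/* Worked example of the layout built above (editor's illustration, with hypothetical
   values l10 and l21). For n = 3 with strictly lower entries L(1,0) and L(2,1), the
   unit lower factor

     [ 1         ]
     [ l10 1     ]
     [ 0   l21 1 ]

   has nzLower = n + ai[n] - ai[1] = 3 + 2 - 0 = 5 entries and is stored as

     AiLo = {0, 1, 3, 5}          row offsets
     AjLo = {0, 0, 1, 1, 2}       column indices, diagonal written last in each row
     AALo = {1, l10, 1, l21, 1}   values, with explicit 1's on the diagonal
*/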
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
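/* A note on the backward traversal above (editor's summary of the storage convention the
   code relies on): in the factored SeqAIJ matrix, a->diag[i] points at the diagonal entry
   of row i of U and decreases with i, so row i holds nz = adiag[i] - adiag[i+1] - 1
   off-diagonal entries starting at aa + adiag[i+1] + 1, and nzUpper = adiag[0] - adiag[n].
   The diagonal value written to the GPU factor is 1./v[nz], matching the usual PETSc
   convention that the CPU factor stores the inverted diagonal while the
   CUSPARSE_DIAG_TYPE_NON_UNIT solve expects the diagonal itself.
*/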
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
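/* The rpermIndices/cpermIndices vectors uploaded above are consumed in the
   MatSolve(Transpose)_SeqAIJCUSPARSE callbacks (see MatSolveTranspose_SeqAIJCUSPARSE
   below): the right-hand side is gathered through one permutation before the two csrsv
   solves and the solution is scattered through the other afterwards, so the GPU factors
   themselves never need to be reordered.
*/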
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* the "lower" factor reuses the upper-triangular storage and is solved with CUSPARSE_OPERATION_TRANSPOSE, hence the UPPER fill mode here */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
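/* End-to-end sketch of how the factorization callbacks above are reached (editor's
   example using standard PETSc API; not part of this file):

     Mat           F;
     IS            rowperm, colperm;
     MatFactorInfo info;
     PetscCall(MatFactorInfoInitialize(&info));
     PetscCall(MatGetOrdering(A, MATORDERINGNATURAL, &rowperm, &colperm));
     PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_CHOLESKY, &F));
     PetscCall(MatCholeskyFactorSymbolic(F, A, rowperm, &info));
     PetscCall(MatCholeskyFactorNumeric(F, A, &info)); // lands in the routine above
     PetscCall(MatSolve(F, b, x));                     // dispatches through B->ops->solve
*/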
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
#else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  /* christ, would it have killed you to put this stuff in a function????????? */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
1153 1154 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1155 should be filled with indexBase. So I just take a shortcut here. 1156 */ 1157 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1158 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1159 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 1160 PetscCallCUSPARSE(stat); 1161 #else 1162 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1163 PetscCallCUSPARSE(stat); 1164 #endif 1165 } else { 1166 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1167 } 1168 1169 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1170 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1171 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1172 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1173 #endif 1174 } 1175 PetscCallThrust( 1176 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1177 } 1178 PetscCall(PetscLogGpuTimeEnd()); 1179 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1180 /* the compressed row indices is not used for matTranspose */ 1181 matstructT->cprowIndices = NULL; 1182 /* assign the pointer */ 1183 ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 1184 A->transupdated = PETSC_TRUE; 1185 PetscFunctionReturn(PETSC_SUCCESS); 1186 } 1187 1188 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1189 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1190 { 1191 PetscInt n = xx->map->n; 1192 const PetscScalar *barray; 1193 PetscScalar *xarray; 1194 thrust::device_ptr<const PetscScalar> bGPU; 1195 thrust::device_ptr<PetscScalar> xGPU; 1196 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1197 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1198 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1199 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1200 1201 PetscFunctionBegin; 1202 /* Analyze the matrix and create the transpose ... 
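
    /* With csr2csc_i in hand, updating the transpose's values reduces to the gather
       matrixT->values[k] = matrix->values[csr2csc_i[k]], k = 0..nnz-1. As a small (hypothetical)
       illustration: for the 2x2 matrix [a b; c 0] stored in CSR as values = [a, b, c], the CSR of the
       transpose is values = [a, c, b], so csr2csc_i = [0, 2, 1]. The thrust::copy below performs
       exactly this gather through a permutation iterator, with no further csr2csc calls needed when
       only the numerical values change. */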
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                               n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
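
  /* Note the buffer alternation: the L solve reads tempGPU and writes xarray, the U solve below reads
     xarray and writes tempGPU, since csrsv writes its result to a separate output array (no in-place
     solve); the final column permutation then gathers from tempGPU back into xGPU. */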

  /* Then, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if CUSPARSE_VERSION >= 11500
/* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
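  /* A sketch (placeholders, not variables of this file) of the generic cuSPARSE SpSV workflow the
     ILU0/ICC0 paths rely on; descriptor creation, buffer-size query and analysis are done once in the
     factorization routines below, so each solve only pays for the cusparseSpSV_solve() calls:

       cusparseSpSV_createDescr(&spsvDescr);                                    // once per triangular factor
       cusparseSpSV_bufferSize(handle, op, &alpha, matL, X, Y, ..., &bufSize);  // query workspace size
       cudaMalloc(&buf, bufSize);
       cusparseSpSV_analysis(handle, op, &alpha, matL, X, Y, ..., buf);         // numeric analysis
       cusparseSpSV_solve(handle, op, &alpha, matL, X, Y, ...);                 // cheap, repeatable
  */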
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!

  /* Solve U*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's values to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact in place */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values; therefore we do it after cusparseXcsrilu02().
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
     but they will not be used; we allocate them just for easy debugging.
  */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
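  /* Note the pattern below: L and U are not stored separately. Both cusparseSpMatDescr_t views alias
     the same csrRowPtr/csrColIdx/csrVal arrays, which hold the in-place ILU(0) factor M = (L - I) + U;
     the FILL_MODE/DIAG_TYPE attributes tell cusparse which triangle to read and that L has an implicit
     unit diagonal, so no extra copies of the factor are needed. */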
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U cannot be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
  */
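  /* For example (hypothetical sizes): with factBufferSize_M = 8 MB, spsvBufferSize_L = 10 MB and
     spsvBufferSize_U = 6 MB, the branch below allocates max(10, 8) = 10 MB shared by factBuffer_M and
     spsvBuffer_L, plus a separate 6 MB spsvBuffer_U, i.e. 16 MB total instead of 24 MB for three
     independent buffers. */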
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSV on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left of the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We eliminate the nonzeros left of the diagonal one by one. Each elimination updates the nonzeros
           to the right of (and including) the eliminated one, which incurs a multiplication and an addition
           per updated entry.
        */
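        /* Worked example: a row with nzRow = 5 entries, nzLeft = 2 of them left of the diagonal,
           costs 2 * (2*5 - 2 + 1) = 18 flops: eliminating the first left-of-diagonal entry updates
           5 entries (10 flops), the second updates 4 (8 flops). */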
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's values to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact in place */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
  */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
     but they will not be used; we allocate them just for easy debugging.
  */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffers.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We eliminate the nonzeros left of the diagonal one by one. Each elimination updates the nonzeros
           to the right of (and including) the eliminated one, which incurs a multiplication and an addition
           per updated entry. Since the diagonal positions are not computed here, assume half of the
           off-diagonal nonzeros lie left of the diagonal.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
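
/* A usage sketch (standard PETSc options; the exact path depends on the setup): with a MATSEQAIJCUSPARSE
   matrix, the device ILU(0)/ICC(0) fast paths above are only reached for zero fill levels and identity
   permutations, e.g. roughly

     -pc_type ilu -pc_factor_mat_ordering_type natural    (reaches MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0)
     -pc_type icc -pc_factor_mat_ordering_type natural    (reaches MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0)

   Nonzero fill levels, non-identity orderings, or host-bound factorization fall back to the host
   symbolic/numeric factorization, with the triangular solves still performed on the GPU. */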

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
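
  /* For example, "-mat_factor_bind_factorization host" computes the factorization on the CPU while the
     triangular solves still run on the GPU; the default "device" keeps the zero-fill ILU(0)/ICC(0)
     factorization on the GPU whenever the requested ordering permits it. */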
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 11500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 11500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
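
/* These MatSeqAIJ{Get,Restore}Array[Read|Write] variants keep the offload mask consistent: getting the
   array with read intent first syncs the host copy (leaving PETSC_OFFLOAD_BOTH), while restoring after
   a write marks PETSC_OFFLOAD_CPU so the next device operation re-uploads via MatSeqAIJCUSPARSECopyToGPU(). */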

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build the matrix in the requested storage format: CSR, or hybrid/ELLPACK if that option was chosen */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }
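
        /* The compressed-row path stores only rows that contain nonzeros: as a (hypothetical)
           illustration, for a 6-row matrix where only rows 1 and 4 are nonempty, m = 2 and
           ridx = [1, 4]; kernels that combine the compressed result with a full-length vector
           use cprowIndices below to scatter back into the right positions. */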
*/ 2198 stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2199 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2200 PetscCallCUSPARSE(stat); 2201 } 2202 #endif 2203 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2204 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2205 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2206 #else 2207 CsrMatrix *mat = new CsrMatrix; 2208 mat->num_rows = m; 2209 mat->num_cols = A->cmap->n; 2210 mat->num_entries = nnz; 2211 mat->row_offsets = new THRUSTINTARRAY32(m + 1); 2212 mat->row_offsets->assign(ii, ii + m + 1); 2213 2214 mat->column_indices = new THRUSTINTARRAY32(nnz); 2215 mat->column_indices->assign(a->j, a->j + nnz); 2216 2217 mat->values = new THRUSTARRAY(nnz); 2218 if (a->a) mat->values->assign(a->a, a->a + nnz); 2219 2220 cusparseHybMat_t hybMat; 2221 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 2222 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2223 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 2224 PetscCallCUSPARSE(stat); 2225 /* assign the pointer */ 2226 matstruct->mat = hybMat; 2227 2228 if (mat) { 2229 if (mat->values) delete (THRUSTARRAY *)mat->values; 2230 if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2231 if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2232 delete (CsrMatrix *)mat; 2233 } 2234 #endif 2235 } 2236 2237 /* assign the compressed row indices */ 2238 if (a->compressedrow.use) { 2239 cusparsestruct->workVector = new THRUSTARRAY(m); 2240 matstruct->cprowIndices = new THRUSTINTARRAY(m); 2241 matstruct->cprowIndices->assign(ridx, ridx + m); 2242 tmp = m; 2243 } else { 2244 cusparsestruct->workVector = NULL; 2245 matstruct->cprowIndices = NULL; 2246 tmp = 0; 2247 } 2248 PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2249 2250 /* assign the pointer */ 2251 cusparsestruct->mat = matstruct; 2252 } catch (char *ex) { 2253 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 2254 } 2255 PetscCallCUDA(WaitForCUDA()); 2256 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2257 cusparsestruct->nonzerostate = A->nonzerostate; 2258 } 2259 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2260 } 2261 PetscFunctionReturn(PETSC_SUCCESS); 2262 } 2263 2264 struct VecCUDAPlusEquals { 2265 template <typename Tuple> 2266 __host__ __device__ void operator()(Tuple t) 2267 { 2268 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2269 } 2270 }; 2271 2272 struct VecCUDAEquals { 2273 template <typename Tuple> 2274 __host__ __device__ void operator()(Tuple t) 2275 { 2276 thrust::get<1>(t) = thrust::get<0>(t); 2277 } 2278 }; 2279 2280 struct VecCUDAEqualsReverse { 2281 template <typename Tuple> 2282 __host__ __device__ void operator()(Tuple t) 2283 { 2284 thrust::get<0>(t) = thrust::get<1>(t); 2285 } 2286 }; 2287 2288 struct MatMatCusparse { 2289 PetscBool cisdense; 2290 PetscScalar *Bt; 2291 Mat X; 
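/* X: intermediate dense GPU matrix; it holds A*P for MATPRODUCT_PtAP and A*R^T for MATPRODUCT_RARt, before the final dense-dense product that forms C */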
2292 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2293 PetscLogDouble flops; 2294 CsrMatrix *Bcsr; 2295 2296 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2297 cusparseSpMatDescr_t matSpBDescr; 2298 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2299 cusparseDnMatDescr_t matBDescr; 2300 cusparseDnMatDescr_t matCDescr; 2301 PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2302 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2303 void *dBuffer4; 2304 void *dBuffer5; 2305 #endif 2306 size_t mmBufferSize; 2307 void *mmBuffer; 2308 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2309 cusparseSpGEMMDescr_t spgemmDesc; 2310 #endif 2311 }; 2312 2313 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2314 { 2315 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2316 2317 PetscFunctionBegin; 2318 PetscCallCUDA(cudaFree(mmdata->Bt)); 2319 delete mmdata->Bcsr; 2320 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2321 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2322 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2323 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2324 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2325 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2326 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2327 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2328 #endif 2329 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2330 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2331 #endif 2332 PetscCall(MatDestroy(&mmdata->X)); 2333 PetscCall(PetscFree(data)); 2334 PetscFunctionReturn(PETSC_SUCCESS); 2335 } 2336 2337 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal() 2338 2339 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2340 { 2341 Mat_Product *product = C->product; 2342 Mat A, B; 2343 PetscInt m, n, blda, clda; 2344 PetscBool flg, biscuda; 2345 Mat_SeqAIJCUSPARSE *cusp; 2346 cusparseStatus_t stat; 2347 cusparseOperation_t opA; 2348 const PetscScalar *barray; 2349 PetscScalar *carray; 2350 MatMatCusparse *mmdata; 2351 Mat_SeqAIJCUSPARSEMultStruct *mat; 2352 CsrMatrix *csrmat; 2353 2354 PetscFunctionBegin; 2355 MatCheckProduct(C, 1); 2356 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2357 mmdata = (MatMatCusparse *)product->data; 2358 A = product->A; 2359 B = product->B; 2360 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2361 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2362 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2363 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2364 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2365 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2366 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2367 switch (product->type) { 2368 case MATPRODUCT_AB: 2369 case MATPRODUCT_PtAP: 2370 mat = cusp->mat; 2371 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2372 m = A->rmap->n; 2373 n = B->cmap->n; 2374 break; 2375 case MATPRODUCT_AtB: 2376 if 
(!A->form_explicit_transpose) { 2377 mat = cusp->mat; 2378 opA = CUSPARSE_OPERATION_TRANSPOSE; 2379 } else { 2380 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2381 mat = cusp->matTranspose; 2382 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2383 } 2384 m = A->cmap->n; 2385 n = B->cmap->n; 2386 break; 2387 case MATPRODUCT_ABt: 2388 case MATPRODUCT_RARt: 2389 mat = cusp->mat; 2390 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2391 m = A->rmap->n; 2392 n = B->rmap->n; 2393 break; 2394 default: 2395 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2396 } 2397 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2398 csrmat = (CsrMatrix *)mat->mat; 2399 /* if the user passed a CPU matrix, copy the data to the GPU */ 2400 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 2401 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2402 PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr)); 2403 2404 PetscCall(MatDenseGetLDA(B, &blda)); 2405 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2406 PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr)); 2407 PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2408 } else { 2409 PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr)); 2410 PetscCall(MatDenseGetLDA(C, &clda)); 2411 } 2412 2413 PetscCall(PetscLogGpuTimeBegin()); 2414 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2415 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2416 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2417 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2418 size_t mmBufferSize; 2419 if (mmdata->initialized && mmdata->Blda != blda) { 2420 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2421 mmdata->matBDescr = NULL; 2422 } 2423 if (!mmdata->matBDescr) { 2424 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2425 mmdata->Blda = blda; 2426 } 2427 2428 if (mmdata->initialized && mmdata->Clda != clda) { 2429 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2430 mmdata->matCDescr = NULL; 2431 } 2432 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2433 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2434 mmdata->Clda = clda; 2435 } 2436 2437 if (!mat->matDescr) { 2438 stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2439 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2440 PetscCallCUSPARSE(stat); 2441 } 2442 stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize); 2443 PetscCallCUSPARSE(stat); 2444 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2445 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2446 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2447 
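/* note: the buffer only grows; later numeric calls that need at most mmBufferSize bytes reuse the existing allocation */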
mmdata->mmBufferSize = mmBufferSize; 2448 } 2449 mmdata->initialized = PETSC_TRUE; 2450 } else { 2451 /* to be safe, always update pointers of the mats */ 2452 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get())); 2453 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 2454 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2455 } 2456 2457 /* do cusparseSpMM, which supports transpose on B */ 2458 stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer); 2459 PetscCallCUSPARSE(stat); 2460 #else 2461 PetscInt k; 2462 /* cusparseXcsrmm does not support transpose on B */ 2463 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2464 cublasHandle_t cublasv2handle; 2465 cublasStatus_t cerr; 2466 2467 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2468 cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 2469 PetscCallCUBLAS(cerr); 2470 blda = B->cmap->n; 2471 k = B->cmap->n; 2472 } else { 2473 k = B->rmap->n; 2474 } 2475 2476 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2477 stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda); 2478 PetscCallCUSPARSE(stat); 2479 #endif 2480 PetscCall(PetscLogGpuTimeEnd()); 2481 PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 2482 PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray)); 2483 if (product->type == MATPRODUCT_RARt) { 2484 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2485 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2486 } else if (product->type == MATPRODUCT_PtAP) { 2487 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2488 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2489 } else { 2490 PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray)); 2491 } 2492 if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 2493 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 2494 PetscFunctionReturn(PETSC_SUCCESS); 2495 } 2496 2497 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2498 { 2499 Mat_Product *product = C->product; 2500 Mat A, B; 2501 PetscInt m, n; 2502 PetscBool cisdense, flg; 2503 MatMatCusparse *mmdata; 2504 Mat_SeqAIJCUSPARSE *cusp; 2505 2506 PetscFunctionBegin; 2507 MatCheckProduct(C, 1); 2508 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2509 A = product->A; 2510 B = product->B; 2511 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2512 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2513 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2514 PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2515 switch (product->type) { 2516 case MATPRODUCT_AB: 2517 m = A->rmap->n; 
2518 n = B->cmap->n; 2519 break; 2520 case MATPRODUCT_AtB: 2521 m = A->cmap->n; 2522 n = B->cmap->n; 2523 break; 2524 case MATPRODUCT_ABt: 2525 m = A->rmap->n; 2526 n = B->rmap->n; 2527 break; 2528 case MATPRODUCT_PtAP: 2529 m = B->cmap->n; 2530 n = B->cmap->n; 2531 break; 2532 case MATPRODUCT_RARt: 2533 m = B->rmap->n; 2534 n = B->rmap->n; 2535 break; 2536 default: 2537 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2538 } 2539 PetscCall(MatSetSizes(C, m, n, m, n)); 2540 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */ 2541 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 2542 PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2543 2544 /* product data */ 2545 PetscCall(PetscNew(&mmdata)); 2546 mmdata->cisdense = cisdense; 2547 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2548 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2549 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2550 #endif 2551 /* for these products we need intermediate storage */ 2552 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2553 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 2554 PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2555 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2556 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2557 } else { 2558 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2559 } 2560 } 2561 C->product->data = mmdata; 2562 C->product->destroy = MatDestroy_MatMatCusparse; 2563 2564 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2565 PetscFunctionReturn(PETSC_SUCCESS); 2566 } 2567 2568 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2569 { 2570 Mat_Product *product = C->product; 2571 Mat A, B; 2572 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2573 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2574 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2575 CsrMatrix *Acsr, *Bcsr, *Ccsr; 2576 PetscBool flg; 2577 cusparseStatus_t stat; 2578 MatProductType ptype; 2579 MatMatCusparse *mmdata; 2580 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2581 cusparseSpMatDescr_t BmatSpDescr; 2582 #endif 2583 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2584 2585 PetscFunctionBegin; 2586 MatCheckProduct(C, 1); 2587 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2588 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg)); 2589 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name); 2590 mmdata = (MatMatCusparse *)C->product->data; 2591 A = product->A; 2592 B = product->B; 2593 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */ 2594 mmdata->reusesym = PETSC_FALSE; 2595 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2596 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for
MAT_CUSPARSE_CSR format"); 2597 Cmat = Ccusp->mat; 2598 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]); 2599 Ccsr = (CsrMatrix *)Cmat->mat; 2600 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2601 goto finalize; 2602 } 2603 if (!c->nz) goto finalize; 2604 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2605 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2606 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 2607 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 2608 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2609 PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2610 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2611 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2612 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2613 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2614 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2615 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2616 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2617 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2618 2619 ptype = product->type; 2620 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2621 ptype = MATPRODUCT_AB; 2622 PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric"); 2623 } 2624 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2625 ptype = MATPRODUCT_AB; 2626 PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric"); 2627 } 2628 switch (ptype) { 2629 case MATPRODUCT_AB: 2630 Amat = Acusp->mat; 2631 Bmat = Bcusp->mat; 2632 break; 2633 case MATPRODUCT_AtB: 2634 Amat = Acusp->matTranspose; 2635 Bmat = Bcusp->mat; 2636 break; 2637 case MATPRODUCT_ABt: 2638 Amat = Acusp->mat; 2639 Bmat = Bcusp->matTranspose; 2640 break; 2641 default: 2642 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2643 } 2644 Cmat = Ccusp->mat; 2645 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2646 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2647 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]); 2648 Acsr = (CsrMatrix *)Amat->mat; 2649 Bcsr = mmdata->Bcsr ? 
mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */ 2650 Ccsr = (CsrMatrix *)Cmat->mat; 2651 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2652 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2653 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2654 PetscCall(PetscLogGpuTimeBegin()); 2655 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2656 BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2657 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2658 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2659 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2660 PetscCallCUSPARSE(stat); 2661 #else 2662 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 2663 PetscCallCUSPARSE(stat); 2664 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2665 PetscCallCUSPARSE(stat); 2666 #endif 2667 #else 2668 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 2669 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 2670 PetscCallCUSPARSE(stat); 2671 #endif 2672 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2673 PetscCallCUDA(WaitForCUDA()); 2674 PetscCall(PetscLogGpuTimeEnd()); 2675 C->offloadmask = PETSC_OFFLOAD_GPU; 2676 finalize: 2677 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2678 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 2679 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 2680 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 2681 c->reallocs = 0; 2682 C->info.mallocs += 0; 2683 C->info.nz_unneeded = 0; 2684 C->assembled = C->was_assembled = PETSC_TRUE; 2685 C->num_ass++; 2686 PetscFunctionReturn(PETSC_SUCCESS); 2687 } 2688 2689 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2690 { 2691 Mat_Product *product = C->product; 2692 Mat A, B; 2693 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2694 Mat_SeqAIJ *a, *b, *c; 2695 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2696 CsrMatrix *Acsr, *Bcsr, *Ccsr; 2697 PetscInt i, j, m, n, k; 2698 PetscBool flg; 2699 cusparseStatus_t stat; 2700 MatProductType ptype; 2701 MatMatCusparse *mmdata; 2702 PetscLogDouble flops; 2703 PetscBool biscompressed, ciscompressed; 2704 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2705 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2706 cusparseSpMatDescr_t BmatSpDescr; 2707 #else 2708 int cnz; 2709 #endif 2710 cusparseOperation_t opA = 
CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2711 2712 PetscFunctionBegin; 2713 MatCheckProduct(C, 1); 2714 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2715 A = product->A; 2716 B = product->B; 2717 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2718 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2719 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 2720 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 2721 a = (Mat_SeqAIJ *)A->data; 2722 b = (Mat_SeqAIJ *)B->data; 2723 /* product data */ 2724 PetscCall(PetscNew(&mmdata)); 2725 C->product->data = mmdata; 2726 C->product->destroy = MatDestroy_MatMatCusparse; 2727 2728 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2729 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2730 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2731 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2732 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2733 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2734 2735 ptype = product->type; 2736 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2737 ptype = MATPRODUCT_AB; 2738 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2739 } 2740 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2741 ptype = MATPRODUCT_AB; 2742 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2743 } 2744 biscompressed = PETSC_FALSE; 2745 ciscompressed = PETSC_FALSE; 2746 switch (ptype) { 2747 case MATPRODUCT_AB: 2748 m = A->rmap->n; 2749 n = B->cmap->n; 2750 k = A->cmap->n; 2751 Amat = Acusp->mat; 2752 Bmat = Bcusp->mat; 2753 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2754 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2755 break; 2756 case MATPRODUCT_AtB: 2757 m = A->cmap->n; 2758 n = B->cmap->n; 2759 k = A->rmap->n; 2760 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2761 Amat = Acusp->matTranspose; 2762 Bmat = Bcusp->mat; 2763 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2764 break; 2765 case MATPRODUCT_ABt: 2766 m = A->rmap->n; 2767 n = B->rmap->n; 2768 k = A->cmap->n; 2769 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2770 Amat = Acusp->mat; 2771 Bmat = Bcusp->matTranspose; 2772 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2773 break; 2774 default: 2775 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2776 } 2777 2778 /* create cusparse matrix */ 2779 PetscCall(MatSetSizes(C, m, n, m, n)); 2780 PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 2781 c = (Mat_SeqAIJ *)C->data; 2782 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2783 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2784 Ccsr = new CsrMatrix; 2785 2786 c->compressedrow.use = ciscompressed; 2787 if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */ 2788 c->compressedrow.nrows = a->compressedrow.nrows; 2789 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 2790
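/* worked example (hypothetical sizes): if A has 4 rows of which only rows {1,3} are nonzero, then a->compressedrow.nrows == 2 and rindex == {1,3}; C inherits that rindex below, c->compressedrow.i gets the nrows+1 offsets of just those rows, and the full-length c->i is expanded from it in finalizesym */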
PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 2791 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2792 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2793 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 2794 } else { 2795 c->compressedrow.nrows = 0; 2796 c->compressedrow.i = NULL; 2797 c->compressedrow.rindex = NULL; 2798 Ccusp->workVector = NULL; 2799 Cmat->cprowIndices = NULL; 2800 } 2801 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2802 Ccusp->mat = Cmat; 2803 Ccusp->mat->mat = Ccsr; 2804 Ccsr->num_rows = Ccusp->nrows; 2805 Ccsr->num_cols = n; 2806 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 2807 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 2808 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 2809 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2810 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 2811 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 2812 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 2813 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2814 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2815 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2816 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */ 2817 thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0); 2818 c->nz = 0; 2819 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2820 Ccsr->values = new THRUSTARRAY(c->nz); 2821 goto finalizesym; 2822 } 2823 2824 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2825 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2826 Acsr = (CsrMatrix *)Amat->mat; 2827 if (!biscompressed) { 2828 Bcsr = (CsrMatrix *)Bmat->mat; 2829 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2830 BmatSpDescr = Bmat->matDescr; 2831 #endif 2832 } else { /* we need to use row offsets for the full matrix */ 2833 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 2834 Bcsr = new CsrMatrix; 2835 Bcsr->num_rows = B->rmap->n; 2836 Bcsr->num_cols = cBcsr->num_cols; 2837 Bcsr->num_entries = cBcsr->num_entries; 2838 Bcsr->column_indices = cBcsr->column_indices; 2839 Bcsr->values = cBcsr->values; 2840 if (!Bcusp->rowoffsets_gpu) { 2841 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2842 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 2843 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 2844 } 2845 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2846 mmdata->Bcsr = Bcsr; 2847 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2848 if (Bcsr->num_rows && Bcsr->num_cols) { 2849 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2850 PetscCallCUSPARSE(stat); 2851 } 2852
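/* matSpBDescr wraps B with the uncompressed row offsets; it is owned by mmdata and released in MatDestroy_MatMatCusparse() */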
BmatSpDescr = mmdata->matSpBDescr; 2853 #endif 2854 } 2855 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2856 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2857 /* precompute flops count */ 2858 if (ptype == MATPRODUCT_AB) { 2859 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2860 const PetscInt st = a->i[i]; 2861 const PetscInt en = a->i[i + 1]; 2862 for (j = st; j < en; j++) { 2863 const PetscInt brow = a->j[j]; 2864 flops += 2. * (b->i[brow + 1] - b->i[brow]); 2865 } 2866 } 2867 } else if (ptype == MATPRODUCT_AtB) { 2868 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2869 const PetscInt anzi = a->i[i + 1] - a->i[i]; 2870 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 2871 flops += (2. * anzi) * bnzi; 2872 } 2873 } else { /* TODO */ 2874 flops = 0.; 2875 } 2876 2877 mmdata->flops = flops; 2878 PetscCall(PetscLogGpuTimeBegin()); 2879 2880 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2881 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2882 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2883 PetscCallCUSPARSE(stat); 2884 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2885 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2886 { 2887 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 2888 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2889 */ 2890 void *dBuffer1 = NULL; 2891 void *dBuffer2 = NULL; 2892 void *dBuffer3 = NULL; 2893 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2894 size_t bufferSize1 = 0; 2895 size_t bufferSize2 = 0; 2896 size_t bufferSize3 = 0; 2897 size_t bufferSize4 = 0; 2898 size_t bufferSize5 = 0; 2899 2900 /* ask bufferSize1 bytes for external memory */ 2901 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 2902 PetscCallCUSPARSE(stat); 2903 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 2904 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2905 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 2906 PetscCallCUSPARSE(stat); 2907 2908 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 2909 PetscCallCUSPARSE(stat); 2910 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 2911 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 2912 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 2913 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 2914 PetscCallCUSPARSE(stat); 2915 PetscCallCUDA(cudaFree(dBuffer1)); 2916 PetscCallCUDA(cudaFree(dBuffer2)); 2917 2918 /* get matrix C non-zero entries C_nnz1 */ 2919 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, 
&C_nnz1)); 2920 c->nz = (PetscInt)C_nnz1; 2921 /* allocate matrix C */ 2922 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2923 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2924 Ccsr->values = new THRUSTARRAY(c->nz); 2925 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2926 /* update matC with the new pointers */ 2927 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 2928 PetscCallCUSPARSE(stat); 2929 2930 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 2931 PetscCallCUSPARSE(stat); 2932 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 2933 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 2934 PetscCallCUSPARSE(stat); 2935 PetscCallCUDA(cudaFree(dBuffer3)); 2936 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2937 PetscCallCUSPARSE(stat); 2938 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 2939 } 2940 #else 2941 size_t bufSize2; 2942 /* ask how many bytes of external memory are needed */ 2943 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 2944 PetscCallCUSPARSE(stat); 2945 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 2946 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2947 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 2948 PetscCallCUSPARSE(stat); 2949 /* ask again how many bytes of external memory are needed */ 2950 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 2951 PetscCallCUSPARSE(stat); 2952 /* Neither the CUSPARSE documentation nor the API is clear: 2953 we need both buffers to perform the operations properly! 2954 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API; 2955 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2956 is stored in the descriptor! What a messy API...
*/ 2957 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 2958 /* compute the intermediate product of A * B */ 2959 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 2960 PetscCallCUSPARSE(stat); 2961 /* get matrix C non-zero entries C_nnz1 */ 2962 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2963 c->nz = (PetscInt)C_nnz1; 2964 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 2965 mmdata->mmBufferSize / 1024)); 2966 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2967 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2968 Ccsr->values = new THRUSTARRAY(c->nz); 2969 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2970 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 2971 PetscCallCUSPARSE(stat); 2972 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2973 PetscCallCUSPARSE(stat); 2974 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2975 #else 2976 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 2977 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 2978 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 2979 PetscCallCUSPARSE(stat); 2980 c->nz = cnz; 2981 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2982 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2983 Ccsr->values = new THRUSTARRAY(c->nz); 2984 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2985 2986 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2987 /* with the old gemm interface (removed as of CUDA 11.0) we cannot compute only the symbolic factorization. 2988 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for the values, but it seems quite buggy when 2989 D is NULL, despite the fact that the CUSPARSE documentation claims this is supported!
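For reference, the legacy path below therefore performs the full numeric product already in this symbolic phase, and the numeric phase recomputes it.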
*/ 2990 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 2991 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 2992 PetscCallCUSPARSE(stat); 2993 #endif 2994 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2995 PetscCall(PetscLogGpuTimeEnd()); 2996 finalizesym: 2997 c->singlemalloc = PETSC_FALSE; 2998 c->free_a = PETSC_TRUE; 2999 c->free_ij = PETSC_TRUE; 3000 PetscCall(PetscMalloc1(m + 1, &c->i)); 3001 PetscCall(PetscMalloc1(c->nz, &c->j)); 3002 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3003 PetscInt *d_i = c->i; 3004 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3005 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3006 ii = *Ccsr->row_offsets; 3007 jj = *Ccsr->column_indices; 3008 if (ciscompressed) d_i = c->compressedrow.i; 3009 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3010 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3011 } else { 3012 PetscInt *d_i = c->i; 3013 if (ciscompressed) d_i = c->compressedrow.i; 3014 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3015 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3016 } 3017 if (ciscompressed) { /* need to expand host row offsets */ 3018 PetscInt r = 0; 3019 c->i[0] = 0; 3020 for (k = 0; k < c->compressedrow.nrows; k++) { 3021 const PetscInt next = c->compressedrow.rindex[k]; 3022 const PetscInt old = c->compressedrow.i[k]; 3023 for (; r < next; r++) c->i[r + 1] = old; 3024 } 3025 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3026 } 3027 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3028 PetscCall(PetscMalloc1(m, &c->ilen)); 3029 PetscCall(PetscMalloc1(m, &c->imax)); 3030 c->maxnz = c->nz; 3031 c->nonzerorowcnt = 0; 3032 c->rmax = 0; 3033 for (k = 0; k < m; k++) { 3034 const PetscInt nn = c->i[k + 1] - c->i[k]; 3035 c->ilen[k] = c->imax[k] = nn; 3036 c->nonzerorowcnt += (PetscInt) !!nn; 3037 c->rmax = PetscMax(c->rmax, nn); 3038 } 3039 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3040 PetscCall(PetscMalloc1(c->nz, &c->a)); 3041 Ccsr->num_entries = c->nz; 3042 3043 C->nonzerostate++; 3044 PetscCall(PetscLayoutSetUp(C->rmap)); 3045 PetscCall(PetscLayoutSetUp(C->cmap)); 3046 Ccusp->nonzerostate = C->nonzerostate; 3047 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3048 C->preallocated = PETSC_TRUE; 3049 C->assembled = PETSC_FALSE; 3050 C->was_assembled = PETSC_FALSE; 3051 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3052 mmdata->reusesym = PETSC_TRUE; 3053 C->offloadmask = PETSC_OFFLOAD_GPU; 3054 } 3055 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3056 PetscFunctionReturn(PETSC_SUCCESS); 3057 } 3058 3059 PETSC_INTERN 
PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3060 3061 /* handles sparse or dense B */ 3062 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3063 { 3064 Mat_Product *product = mat->product; 3065 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3066 3067 PetscFunctionBegin; 3068 MatCheckProduct(mat, 1); 3069 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3070 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3071 if (product->type == MATPRODUCT_ABC) { 3072 Ciscusp = PETSC_FALSE; 3073 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3074 } 3075 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3076 PetscBool usecpu = PETSC_FALSE; 3077 switch (product->type) { 3078 case MATPRODUCT_AB: 3079 if (product->api_user) { 3080 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 3081 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3082 PetscOptionsEnd(); 3083 } else { 3084 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 3085 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3086 PetscOptionsEnd(); 3087 } 3088 break; 3089 case MATPRODUCT_AtB: 3090 if (product->api_user) { 3091 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3092 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3093 PetscOptionsEnd(); 3094 } else { 3095 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3096 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3097 PetscOptionsEnd(); 3098 } 3099 break; 3100 case MATPRODUCT_PtAP: 3101 if (product->api_user) { 3102 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3103 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3104 PetscOptionsEnd(); 3105 } else { 3106 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3107 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3108 PetscOptionsEnd(); 3109 } 3110 break; 3111 case MATPRODUCT_RARt: 3112 if (product->api_user) { 3113 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3114 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3115 PetscOptionsEnd(); 3116 } else { 3117 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3118 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3119 PetscOptionsEnd(); 3120 } 3121 break; 3122 case MATPRODUCT_ABC: 3123 if (product->api_user) { 3124 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat"); 3125 
PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3126 PetscOptionsEnd(); 3127 } else { 3128 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3129 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3130 PetscOptionsEnd(); 3131 } 3132 break; 3133 default: 3134 break; 3135 } 3136 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3137 } 3138 /* dispatch */ 3139 if (isdense) { 3140 switch (product->type) { 3141 case MATPRODUCT_AB: 3142 case MATPRODUCT_AtB: 3143 case MATPRODUCT_ABt: 3144 case MATPRODUCT_PtAP: 3145 case MATPRODUCT_RARt: 3146 if (product->A->boundtocpu) { 3147 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3148 } else { 3149 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3150 } 3151 break; 3152 case MATPRODUCT_ABC: 3153 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3154 break; 3155 default: 3156 break; 3157 } 3158 } else if (Biscusp && Ciscusp) { 3159 switch (product->type) { 3160 case MATPRODUCT_AB: 3161 case MATPRODUCT_AtB: 3162 case MATPRODUCT_ABt: 3163 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3164 break; 3165 case MATPRODUCT_PtAP: 3166 case MATPRODUCT_RARt: 3167 case MATPRODUCT_ABC: 3168 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3169 break; 3170 default: 3171 break; 3172 } 3173 } else { /* fallback for AIJ */ 3174 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3175 } 3176 PetscFunctionReturn(PETSC_SUCCESS); 3177 } 3178 3179 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3180 { 3181 PetscFunctionBegin; 3182 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3183 PetscFunctionReturn(PETSC_SUCCESS); 3184 } 3185 3186 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3187 { 3188 PetscFunctionBegin; 3189 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3190 PetscFunctionReturn(PETSC_SUCCESS); 3191 } 3192 3193 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3194 { 3195 PetscFunctionBegin; 3196 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3197 PetscFunctionReturn(PETSC_SUCCESS); 3198 } 3199 3200 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3201 { 3202 PetscFunctionBegin; 3203 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3204 PetscFunctionReturn(PETSC_SUCCESS); 3205 } 3206 3207 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3208 { 3209 PetscFunctionBegin; 3210 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3211 PetscFunctionReturn(PETSC_SUCCESS); 3212 } 3213 3214 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3215 { 3216 int i = blockIdx.x * blockDim.x + threadIdx.x; 3217 if (i < n) y[idx[i]] += x[i]; 3218 } 3219 3220 /* z = op(A) x + y. 
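When yy is NULL the kernel computes z = op(A) x alone (beta = 0); the thin wrappers above pick trans/herm, e.g. MatMultTranspose passes trans = PETSC_TRUE, herm = PETSC_FALSE.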
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3221 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3222 { 3223 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3224 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3225 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3226 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3227 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3228 PetscBool compressed; 3229 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3230 PetscInt nx, ny; 3231 #endif 3232 3233 PetscFunctionBegin; 3234 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3235 if (!a->nz) { 3236 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3237 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3238 PetscFunctionReturn(PETSC_SUCCESS); 3239 } 3240 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3241 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3242 if (!trans) { 3243 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3244 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3245 } else { 3246 if (herm || !A->form_explicit_transpose) { 3247 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3248 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3249 } else { 3250 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3251 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3252 } 3253 } 3254 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3255 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3256 3257 try { 3258 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3259 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3260 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3261 3262 PetscCall(PetscLogGpuTimeBegin()); 3263 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3264 /* z = A x + beta y. 3265 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3266 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3267 */ 3268 xptr = xarray; 3269 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3270 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3271 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3272 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3273 allocated to accommodate different uses. So we get the length info directly from mat. 3274 */ 3275 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3276 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3277 nx = mat->num_cols; 3278 ny = mat->num_rows; 3279 } 3280 #endif 3281 } else { 3282 /* z = A^T x + beta y 3283 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3284 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3285 */ 3286 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3287 dptr = zarray; 3288 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3289 if (compressed) { /* Scatter x to work vector */ 3290 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3291 3292 thrust::for_each( 3293 #if PetscDefined(HAVE_THRUST_ASYNC) 3294 thrust::cuda::par.on(PetscDefaultCudaStream), 3295 #endif 3296 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3297 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3298 } 3299 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3300 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3301 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3302 nx = mat->num_rows; 3303 ny = mat->num_cols; 3304 } 3305 #endif 3306 } 3307 3308 /* csr_spmv does y = alpha op(A) x + beta y */ 3309 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3310 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3311 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3312 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3313 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3314 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3315 PetscCallCUSPARSE( 3316 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3317 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3318 3319 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3320 } else { 3321 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3322 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3323 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3324 } 3325 3326 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3327 matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3328 #else 3329 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3330 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3331 #endif 3332 } else { 3333 if (cusparsestruct->nrows) { 3334 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3335 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3336 #else 3337 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3338 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3339 #endif 3340 } 3341 } 3342 PetscCall(PetscLogGpuTimeEnd()); 3343 3344 if (opA == 
CUSPARSE_OPERATION_NON_TRANSPOSE) { 3345 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3346 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3347 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3348 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3349 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3350 } 3351 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3352 PetscCall(VecSeq_CUDA::Set(zz, 0)); 3353 } 3354 3355 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3356 if (compressed) { 3357 PetscCall(PetscLogGpuTimeBegin()); 3358 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3359 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3360 prevent that. So I just added a ScatterAdd kernel instead. 3361 */ 3362 #if 0 3363 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3364 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3365 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3366 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3367 VecCUDAPlusEquals()); 3368 #else 3369 PetscInt n = matstruct->cprowIndices->size(); 3370 ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3371 #endif 3372 PetscCall(PetscLogGpuTimeEnd()); 3373 } 3374 } else { 3375 if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3376 } 3377 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3378 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3379 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3380 } catch (char *ex) { 3381 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 3382 } 3383 if (yy) { 3384 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3385 } else { 3386 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 3387 } 3388 PetscFunctionReturn(PETSC_SUCCESS); 3389 } 3390 3391 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3392 { 3393 PetscFunctionBegin; 3394 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE)); 3395 PetscFunctionReturn(PETSC_SUCCESS); 3396 } 3397 3398 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) 3399 { 3400 PetscObjectState onnz = A->nonzerostate; 3401 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 3402 3403 PetscFunctionBegin; 3404 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 3405 if (onnz != A->nonzerostate && cusp->deviceMat) { 3406 PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n")); 3407 PetscCallCUDA(cudaFree(cusp->deviceMat)); 3408 cusp->deviceMat = NULL; 3409 } 3410 PetscFunctionReturn(PETSC_SUCCESS); 3411 } 3412 3413 /*@ 3414 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATSEQAIJCUSPARSE` (compressed row) format 3415 (the GPU version of the default sequential PETSc format `MATSEQAIJ`). This matrix will ultimately be pushed down 3416 to NVIDIA GPUs and will use the cuSPARSE library for calculations. For good matrix 3417 assembly performance the user should preallocate the matrix storage by setting 3418 the parameter `nz` (or the array `nnz`). 3419 3420 Collective 3421 3422 Input Parameters: 3423 + comm - MPI communicator, set to `PETSC_COMM_SELF` 3424 . m - number of rows 3425 . n - number of columns 3426 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 3427 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 3428 3429 Output Parameter: 3430 . A - the matrix 3431 3432 Level: intermediate 3433 3434 Notes: 3435 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 3436 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3437 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 3438 3439 The AIJ format, also called 3440 compressed row storage, is fully compatible with standard Fortran 3441 storage. That is, the stored row and column indices can begin at 3442 either one (as in Fortran) or zero. 3443 3444 Specify the preallocated storage with either nz or nnz (not both). 3445 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 3446 allocation. 3447 3448 .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE` 3449 @*/ 3450 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 3451 { 3452 PetscFunctionBegin; 3453 PetscCall(MatCreate(comm, A)); 3454 PetscCall(MatSetSizes(*A, m, n, m, n)); 3455 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 3456 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 3457 PetscFunctionReturn(PETSC_SUCCESS); 3458 } 3459 3460 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3461 { 3462 PetscFunctionBegin; 3463 if (A->factortype == MAT_FACTOR_NONE) { 3464 PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr)); 3465 } else { 3466 PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr)); 3467 } 3468 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3469 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 3470 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL)); 3471 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3472 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3473 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3474 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 3475 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3476 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3477 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL)); 3478 PetscCall(MatDestroy_SeqAIJ(A)); 3479 PetscFunctionReturn(PETSC_SUCCESS); 3480 } 3481 3482 PETSC_INTERN PetscErrorCode
MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *); 3483 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool); 3484 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) 3485 { 3486 PetscFunctionBegin; 3487 PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 3488 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); 3489 PetscFunctionReturn(PETSC_SUCCESS); 3490 } 3491 3492 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) 3493 { 3494 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3495 Mat_SeqAIJCUSPARSE *cy; 3496 Mat_SeqAIJCUSPARSE *cx; 3497 PetscScalar *ay; 3498 const PetscScalar *ax; 3499 CsrMatrix *csry, *csrx; 3500 3501 PetscFunctionBegin; 3502 cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3503 cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3504 if (X->ops->axpy != Y->ops->axpy) { 3505 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3506 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3507 PetscFunctionReturn(PETSC_SUCCESS); 3508 } 3509 /* if we are here, it means both matrices are bound to GPU */ 3510 PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 3511 PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 3512 PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3513 PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3514 csry = (CsrMatrix *)cy->mat->mat; 3515 csrx = (CsrMatrix *)cx->mat->mat; 3516 /* see if we can turn this into a cublas axpy */ 3517 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3518 bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3519 if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3520 if (eq) str = SAME_NONZERO_PATTERN; 3521 } 3522 /* spgeam is buggy with one column */ 3523 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3524 3525 if (str == SUBSET_NONZERO_PATTERN) { 3526 PetscScalar b = 1.0; 3527 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3528 size_t bufferSize; 3529 void *buffer; 3530 #endif 3531 3532 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3533 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3534 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3535 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3536 PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3537 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 3538 PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 3539 PetscCall(PetscLogGpuTimeBegin()); 3540 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3541 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 3542 PetscCall(PetscLogGpuFlops(x->nz + 
y->nz)); 3543 PetscCall(PetscLogGpuTimeEnd()); 3544 PetscCallCUDA(cudaFree(buffer)); 3545 #else 3546 PetscCall(PetscLogGpuTimeBegin()); 3547 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3548 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 3549 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3550 PetscCall(PetscLogGpuTimeEnd()); 3551 #endif 3552 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3553 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3554 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3555 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3556 } else if (str == SAME_NONZERO_PATTERN) { 3557 cublasHandle_t cublasv2handle; 3558 PetscBLASInt one = 1, bnz = 1; 3559 3560 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3561 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3562 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3563 PetscCall(PetscBLASIntCast(x->nz, &bnz)); 3564 PetscCall(PetscLogGpuTimeBegin()); 3565 PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 3566 PetscCall(PetscLogGpuFlops(2.0 * bnz)); 3567 PetscCall(PetscLogGpuTimeEnd()); 3568 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3569 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3570 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3571 } else { 3572 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3573 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3574 } 3575 PetscFunctionReturn(PETSC_SUCCESS); 3576 } 3577 3578 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3579 { 3580 Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 3581 PetscScalar *ay; 3582 cublasHandle_t cublasv2handle; 3583 PetscBLASInt one = 1, bnz = 1; 3584 3585 PetscFunctionBegin; 3586 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3587 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3588 PetscCall(PetscBLASIntCast(y->nz, &bnz)); 3589 PetscCall(PetscLogGpuTimeBegin()); 3590 PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 3591 PetscCall(PetscLogGpuFlops(bnz)); 3592 PetscCall(PetscLogGpuTimeEnd()); 3593 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3594 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3595 PetscFunctionReturn(PETSC_SUCCESS); 3596 } 3597 3598 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3599 { 3600 PetscBool both = PETSC_FALSE; 3601 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3602 3603 PetscFunctionBegin; 3604 if (A->factortype == MAT_FACTOR_NONE) { 3605 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 3606 if (spptr->mat) { 3607 CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 3608 if (matrix->values) { 3609 both = PETSC_TRUE; 3610 thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3611 } 3612 } 3613 if (spptr->matTranspose) { 3614 CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3615 if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3616 } 3617 } 3618 PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 3619 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 3620 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3621 else A->offloadmask = PETSC_OFFLOAD_CPU; 3622 PetscFunctionReturn(PETSC_SUCCESS); 3623 } 3624 3625 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool 
flg) 3626 { 3627 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3628 3629 PetscFunctionBegin; 3630 if (A->factortype != MAT_FACTOR_NONE) { 3631 A->boundtocpu = flg; 3632 PetscFunctionReturn(PETSC_SUCCESS); 3633 } 3634 if (flg) { 3635 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3636 3637 A->ops->scale = MatScale_SeqAIJ; 3638 A->ops->axpy = MatAXPY_SeqAIJ; 3639 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3640 A->ops->mult = MatMult_SeqAIJ; 3641 A->ops->multadd = MatMultAdd_SeqAIJ; 3642 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3643 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3644 A->ops->multhermitiantranspose = NULL; 3645 A->ops->multhermitiantransposeadd = NULL; 3646 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3647 PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 3648 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3649 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3650 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3651 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3652 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3653 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3654 } else { 3655 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3656 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3657 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3658 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3659 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3660 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3661 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3662 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3663 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3664 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3665 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3666 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3667 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3668 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3669 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3670 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3671 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 3672 3673 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 3674 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 3675 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 3676 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE)); 3677 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 3678 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 3679 } 3680 A->boundtocpu = flg; 3681 if (flg && a->inode.size) { 3682 a->inode.use = PETSC_TRUE; 3683 } else { 3684 a->inode.use = 
PETSC_FALSE; 3685 } 3686 PetscFunctionReturn(PETSC_SUCCESS); 3687 } 3688 3689 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat) 3690 { 3691 Mat B; 3692 3693 PetscFunctionBegin; 3694 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 3695 if (reuse == MAT_INITIAL_MATRIX) { 3696 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 3697 } else if (reuse == MAT_REUSE_MATRIX) { 3698 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 3699 } 3700 B = *newmat; 3701 3702 PetscCall(PetscFree(B->defaultvectype)); 3703 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 3704 3705 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3706 if (B->factortype == MAT_FACTOR_NONE) { 3707 Mat_SeqAIJCUSPARSE *spptr; 3708 PetscCall(PetscNew(&spptr)); 3709 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 3710 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 3711 spptr->format = MAT_CUSPARSE_CSR; 3712 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3713 #if CUSPARSE_VERSION > 11301 3714 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3715 #else 3716 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3717 #endif 3718 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3719 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3720 #endif 3721 B->spptr = spptr; 3722 } else { 3723 Mat_SeqAIJCUSPARSETriFactors *spptr; 3724 3725 PetscCall(PetscNew(&spptr)); 3726 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 3727 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 3728 B->spptr = spptr; 3729 } 3730 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3731 } 3732 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3733 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3734 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3735 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3736 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3737 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3738 3739 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 3740 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 3741 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 3742 #if defined(PETSC_HAVE_HYPRE) 3743 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 3744 #endif 3745 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 3746 PetscFunctionReturn(PETSC_SUCCESS); 3747 } 3748 3749 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3750 { 3751 PetscFunctionBegin; 3752 PetscCall(MatCreate_SeqAIJ(B)); 3753 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 3754 PetscFunctionReturn(PETSC_SUCCESS); 3755 } 3756 3757 /*MC 3758 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3759 3760 A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either 3761 CSR, ELL, or Hybrid format. 3762 All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library. 3763 3764 Options Database Keys: 3765 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()` 3766 .
-mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`). 3767 Other options include ell (ellpack) or hyb (hybrid). 3768 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid). 3769 - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU 3770 3771 Level: beginner 3772 3773 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 3774 M*/ 3775 3776 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *); 3777 3778 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3779 { 3780 PetscFunctionBegin; 3781 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band)); 3782 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 3783 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 3784 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 3785 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 3786 3787 PetscFunctionReturn(PETSC_SUCCESS); 3788 } 3789 3790 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 3791 { 3792 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr; 3793 3794 PetscFunctionBegin; 3795 if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 3796 delete cusp->cooPerm; 3797 delete cusp->cooPerm_a; 3798 cusp->cooPerm = NULL; 3799 cusp->cooPerm_a = NULL; 3800 if (cusp->use_extended_coo) { 3801 PetscCallCUDA(cudaFree(cusp->jmap_d)); 3802 PetscCallCUDA(cudaFree(cusp->perm_d)); 3803 } 3804 cusp->use_extended_coo = PETSC_FALSE; 3805 PetscFunctionReturn(PETSC_SUCCESS); 3806 } 3807 3808 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3809 { 3810 PetscFunctionBegin; 3811 if (*cusparsestruct) { 3812 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format)); 3813 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format)); 3814 delete (*cusparsestruct)->workVector; 3815 delete (*cusparsestruct)->rowoffsets_gpu; 3816 delete (*cusparsestruct)->cooPerm; 3817 delete (*cusparsestruct)->cooPerm_a; 3818 delete (*cusparsestruct)->csr2csc_i; 3819 if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 3820 if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 3821 if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 3822 PetscCall(PetscFree(*cusparsestruct)); 3823 } 3824 PetscFunctionReturn(PETSC_SUCCESS); 3825 } 3826 3827 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3828 { 3829 PetscFunctionBegin; 3830 if (*mat) { 3831 delete (*mat)->values; 3832 delete (*mat)->column_indices; 3833 delete (*mat)->row_offsets; 3834 delete *mat; 3835 *mat = 0; 3836 } 3837 PetscFunctionReturn(PETSC_SUCCESS); 3838 } 3839 3840 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3841 { 3842 
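/* Descriptive note (added): this overload releases everything held by a single triangular-factor struct:
   the cuSPARSE matrix descriptor, the csrsv solve info, the factor's CSR storage, and the
   solve/csr2csc scratch buffers. Every member is checked before being freed, so calling it on a
   partially built struct is safe. */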
PetscFunctionBegin; 3843 if (*trifactor) { 3844 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3845 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 3846 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 3847 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 3848 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3849 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3850 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3851 #endif 3852 PetscCall(PetscFree(*trifactor)); 3853 } 3854 PetscFunctionReturn(PETSC_SUCCESS); 3855 } 3856 3857 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) 3858 { 3859 CsrMatrix *mat; 3860 3861 PetscFunctionBegin; 3862 if (*matstruct) { 3863 if ((*matstruct)->mat) { 3864 if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 3865 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3866 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3867 #else 3868 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3869 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3870 #endif 3871 } else { 3872 mat = (CsrMatrix *)(*matstruct)->mat; 3873 PetscCall(CsrMatrix_Destroy(&mat)); 3874 } 3875 } 3876 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 3877 delete (*matstruct)->cprowIndices; 3878 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 3879 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 3880 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 3881 3882 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3883 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3884 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3885 for (int i = 0; i < 3; i++) { 3886 if (mdata->cuSpMV[i].initialized) { 3887 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 3888 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 3889 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3890 } 3891 } 3892 #endif 3893 delete *matstruct; 3894 *matstruct = NULL; 3895 } 3896 PetscFunctionReturn(PETSC_SUCCESS); 3897 } 3898 3899 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) 3900 { 3901 Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 3902 3903 PetscFunctionBegin; 3904 if (fs) { 3905 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 3906 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 3907 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 3908 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 3909 delete fs->rpermIndices; 3910 delete fs->cpermIndices; 3911 delete fs->workVector; 3912 fs->rpermIndices = NULL; 3913 fs->cpermIndices = NULL; 3914 fs->workVector = NULL; 3915 if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d)); 3916 if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d)); 3917 fs->init_dev_prop = PETSC_FALSE; 3918 #if CUSPARSE_VERSION >= 11500 3919 PetscCallCUDA(cudaFree(fs->csrRowPtr)); 3920 PetscCallCUDA(cudaFree(fs->csrColIdx)); 3921 PetscCallCUDA(cudaFree(fs->csrVal)); 3922 PetscCallCUDA(cudaFree(fs->X)); 3923 PetscCallCUDA(cudaFree(fs->Y)); 3924 // 
PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares memory with one of spsvBuffer_L/U */ 3925 PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 3926 PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 3927 PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 3928 PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 3929 PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 3930 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 3931 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 3932 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 3933 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 3934 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 3935 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 3936 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 3937 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 3938 PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 3939 PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 3940 3941 fs->createdTransposeSpSVDescr = PETSC_FALSE; 3942 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 3943 #endif 3944 } 3945 PetscFunctionReturn(PETSC_SUCCESS); 3946 } 3947 3948 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) 3949 { 3950 PetscFunctionBegin; 3951 if (*trifactors) { 3952 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 3953 PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle)); 3954 PetscCall(PetscFree(*trifactors)); 3955 } 3956 PetscFunctionReturn(PETSC_SUCCESS); 3957 } 3958 3959 struct IJCompare { 3960 __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3961 { 3962 if (t1.get<0>() < t2.get<0>()) return true; 3963 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3964 return false; 3965 } 3966 }; 3967 3968 struct IJEqual { 3969 __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3970 { 3971 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3972 return true; 3973 } 3974 }; 3975 3976 struct IJDiff { 3977 __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ?
0 : 1; } 3978 }; 3979 3980 struct IJSum { 3981 __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; } 3982 }; 3983 3984 #include <thrust/iterator/discard_iterator.h> 3985 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 3986 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 3987 { 3988 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 3989 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3990 THRUSTARRAY *cooPerm_v = NULL; 3991 thrust::device_ptr<const PetscScalar> d_v; 3992 CsrMatrix *matrix; 3993 PetscInt n; 3994 3995 PetscFunctionBegin; 3996 PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct"); 3997 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix"); 3998 if (!cusp->cooPerm) { 3999 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 4000 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 4001 PetscFunctionReturn(PETSC_SUCCESS); 4002 } 4003 matrix = (CsrMatrix *)cusp->mat->mat; 4004 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4005 if (!v) { 4006 if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 4007 goto finalize; 4008 } 4009 n = cusp->cooPerm->size(); 4010 if (isCudaMem(v)) { 4011 d_v = thrust::device_pointer_cast(v); 4012 } else { 4013 cooPerm_v = new THRUSTARRAY(n); 4014 cooPerm_v->assign(v, v + n); 4015 d_v = cooPerm_v->data(); 4016 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 4017 } 4018 PetscCall(PetscLogGpuTimeBegin()); 4019 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 4020 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them up first */ 4021 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 4022 auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()); 4023 /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 4024 cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[]. 4025 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
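As an illustration (a hypothetical run reusing the example arrays of MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic()
below, where cooPerm = [2,4,1,0,3,5]): vbit enumerates d_v[2],d_v[4],d_v[1],d_v[0],d_v[3],d_v[5];
reduce_by_key() sums the first two of those (they share key 0 in cooPerm_a) into cooPerm_w[0] and copies
the rest, so cooPerm_w holds one value per unique nonzero, and transform() then adds cooPerm_w
elementwise into matrix->values.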
4026 */ 4027 thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>()); 4028 thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>()); 4029 delete cooPerm_w; 4030 } else { 4031 /* all nonzeros in d_v[] are unique entries */ 4032 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin())); 4033 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end())); 4034 thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 4035 } 4036 } else { 4037 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 4038 auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()); 4039 thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>()); 4040 } else { 4041 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin())); 4042 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end())); 4043 thrust::for_each(zibit, zieit, VecCUDAEquals()); 4044 } 4045 } 4046 PetscCall(PetscLogGpuTimeEnd()); 4047 finalize: 4048 delete cooPerm_v; 4049 A->offloadmask = PETSC_OFFLOAD_GPU; 4050 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4051 /* shorter version of MatAssemblyEnd_SeqAIJ */ 4052 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz)); 4053 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n")); 4054 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax)); 4055 a->reallocs = 0; 4056 A->info.mallocs += 0; 4057 A->info.nz_unneeded = 0; 4058 A->assembled = A->was_assembled = PETSC_TRUE; 4059 A->num_ass++; 4060 PetscFunctionReturn(PETSC_SUCCESS); 4061 } 4062 4063 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4064 { 4065 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4066 4067 PetscFunctionBegin; 4068 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4069 if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 4070 if (destroy) { 4071 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4072 delete cusp->csr2csc_i; 4073 cusp->csr2csc_i = NULL; 4074 } 4075 A->transupdated = PETSC_FALSE; 4076 PetscFunctionReturn(PETSC_SUCCESS); 4077 } 4078 4079 #include <thrust/binary_search.h> 4080 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 4081 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) 4082 { 4083 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4084 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4085 PetscInt cooPerm_n, nzr = 0; 4086 4087 PetscFunctionBegin; 4088 PetscCall(PetscLayoutSetUp(A->rmap)); 4089 PetscCall(PetscLayoutSetUp(A->cmap)); 4090 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 4091 if (n != cooPerm_n) { 4092 delete cusp->cooPerm; 4093 delete cusp->cooPerm_a; 4094 cusp->cooPerm = NULL; 4095 cusp->cooPerm_a = NULL; 4096 } 4097 if (n) { 4098 thrust::device_ptr<PetscInt> d_i, d_j; 4099 PetscInt *d_raw_i, *d_raw_j; 4100 PetscBool free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE; 4101 PetscMemType imtype, jmtype; 4102 4103 PetscCall(PetscGetMemType(coo_i, &imtype)); 4104 if (PetscMemTypeHost(imtype)) { 4105 PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n)); 4106 PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 4107 d_i = thrust::device_pointer_cast(d_raw_i); 4108 free_raw_i = PETSC_TRUE; 4109 PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt))); 4110 } else { 4111 d_i = thrust::device_pointer_cast(coo_i); 4112 } 4113 4114 PetscCall(PetscGetMemType(coo_j, &jmtype)); 4115 if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]! 4116 PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n)); 4117 PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 4118 d_j = thrust::device_pointer_cast(d_raw_j); 4119 free_raw_j = PETSC_TRUE; 4120 PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt))); 4121 } else { 4122 d_j = thrust::device_pointer_cast(coo_j); 4123 } 4124 4125 THRUSTINTARRAY ii(A->rmap->n); 4126 4127 if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n); 4128 if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n); 4129 4130 /* Ex. 4131 n = 6 4132 coo_i = [3,3,1,4,1,4] 4133 coo_j = [3,2,2,5,2,6] 4134 */ 4135 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j)); 4136 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n)); 4137 4138 PetscCall(PetscLogGpuTimeBegin()); 4139 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4140 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4141 (*cusp->cooPerm_a).assign(d_i, d_i + n); /* copy the sorted array */ 4142 THRUSTINTARRAY w(d_j, d_j + n); 4143 4144 /* 4145 d_i = [1,1,3,3,4,4] 4146 d_j = [2,2,2,3,5,6] 4147 cooPerm = [2,4,1,0,3,5] 4148 */ 4149 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4150 4151 /* 4152 d_i = [1,3,3,4,4,x] 4153 ^ekey 4154 d_j = [2,2,3,5,6,x] 4155 ^nekey 4156 */ 4157 if (nekey == ekey) { /* all entries are unique */ 4158 delete cusp->cooPerm_a; 4159 cusp->cooPerm_a = NULL; 4160 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4161 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4162 adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4163 adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4164 (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 4165 w[0] = 0; 4166 thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4167 thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4168 } 4169 thrust::counting_iterator<PetscInt> search_begin(0); 4170 thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search
entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4171 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4172 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 4173 PetscCall(PetscLogGpuTimeEnd()); 4174 4175 PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i)); 4176 a->singlemalloc = PETSC_FALSE; 4177 a->free_a = PETSC_TRUE; 4178 a->free_ij = PETSC_TRUE; 4179 PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i)); 4180 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4181 PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4182 a->nz = a->maxnz = a->i[A->rmap->n]; 4183 a->rmax = 0; 4184 PetscCall(PetscMalloc1(a->nz, &a->a)); 4185 PetscCall(PetscMalloc1(a->nz, &a->j)); 4186 PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4187 if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen)); 4188 if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax)); 4189 for (PetscInt i = 0; i < A->rmap->n; i++) { 4190 const PetscInt nnzr = a->i[i + 1] - a->i[i]; 4191 nzr += (PetscInt) !!(nnzr); 4192 a->ilen[i] = a->imax[i] = nnzr; 4193 a->rmax = PetscMax(a->rmax, nnzr); 4194 } 4195 a->nonzerorowcnt = nzr; 4196 A->preallocated = PETSC_TRUE; 4197 PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt))); 4198 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 4199 if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i)); 4200 if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j)); 4201 } else { 4202 PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL)); 4203 } 4204 PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 4205 4206 /* We want to allocate the CUSPARSE struct for matvec now. 
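That is, build the GPU CSR mirror of the matrix immediately (via the MatSeqAIJCUSPARSECopyToGPU() below) rather than lazily at the first MatMult().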
4207 The code is convoluted enough right now that it is simpler to just zero the host values and reuse the generic copy-to-GPU path */ 4208 PetscCall(PetscArrayzero(a->a, a->nz)); 4209 PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6)); 4210 A->offloadmask = PETSC_OFFLOAD_CPU; 4211 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4212 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 4213 PetscFunctionReturn(PETSC_SUCCESS); 4214 } 4215 4216 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4217 { 4218 Mat_SeqAIJ *seq; 4219 Mat_SeqAIJCUSPARSE *dev; 4220 PetscBool coo_basic = PETSC_TRUE; 4221 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4222 4223 PetscFunctionBegin; 4224 PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 4225 PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4226 if (coo_i) { 4227 PetscCall(PetscGetMemType(coo_i, &mtype)); 4228 if (PetscMemTypeHost(mtype)) { 4229 for (PetscCount k = 0; k < coo_n; k++) { 4230 if (coo_i[k] < 0 || coo_j[k] < 0) { 4231 coo_basic = PETSC_FALSE; 4232 break; 4233 } 4234 } 4235 } 4236 } 4237 4238 if (coo_basic) { /* i,j are on device or do not contain negative indices */ 4239 PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j)); 4240 } else { 4241 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 4242 mat->offloadmask = PETSC_OFFLOAD_CPU; 4243 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4244 seq = static_cast<Mat_SeqAIJ *>(mat->data); 4245 dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 4246 PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount))); 4247 PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4248 PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount))); 4249 PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4250 dev->use_extended_coo = PETSC_TRUE; 4251 } 4252 PetscFunctionReturn(PETSC_SUCCESS); 4253 } 4254 4255 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) 4256 { 4257 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4258 const PetscCount grid_size = gridDim.x * blockDim.x; 4259 for (; i < nnz; i += grid_size) { 4260 PetscScalar sum = 0.0; 4261 for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4262 a[i] = (imode == INSERT_VALUES ?
0.0 : a[i]) + sum; 4263 } 4264 } 4265 4266 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4267 { 4268 Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 4269 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr; 4270 PetscCount Annz = seq->nz; 4271 PetscMemType memtype; 4272 const PetscScalar *v1 = v; 4273 PetscScalar *Aa; 4274 4275 PetscFunctionBegin; 4276 if (dev->use_extended_coo) { 4277 PetscCall(PetscGetMemType(v, &memtype)); 4278 if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, we need to copy it to the device first */ 4279 PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar))); 4280 PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4281 } 4282 4283 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); 4284 else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); 4285 4286 if (Annz) { 4287 MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa); 4288 PetscCallCUDA(cudaPeekAtLastError()); 4289 } 4290 4291 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 4292 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4293 4294 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 4295 } else { 4296 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode)); 4297 } 4298 PetscFunctionReturn(PETSC_SUCCESS); 4299 } 4300 4301 /*@C 4302 MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices. 4303 4304 Not Collective 4305 4306 Input Parameters: 4307 + A - the matrix 4308 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form 4309 4310 Output Parameters: 4311 + i - the CSR row pointers 4312 - j - the CSR column indices 4313 4314 Level: developer 4315 4316 Note: 4317 When compressed is true, the CSR structure does not contain empty rows 4318 4319 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 4320 @*/ 4321 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4322 { 4323 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4324 CsrMatrix *csr; 4325 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4326 4327 PetscFunctionBegin; 4328 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4329 if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); 4330 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4331 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4332 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4333 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4334 csr = (CsrMatrix *)cusp->mat->mat; 4335 if (i) { 4336 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4337 if (!cusp->rowoffsets_gpu) { 4338 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4339 cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4340 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4341 } 4342 *i = cusp->rowoffsets_gpu->data().get(); 4343 } else *i = csr->row_offsets->data().get(); 4344 } 4345 if (j) *j = csr->column_indices->data().get(); 4346 PetscFunctionReturn(PETSC_SUCCESS); 4347 } 4348 4349 /*@C 4350 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()` 4351 4352 Not Collective 4353 4354 Input Parameters: 4355 + A - the matrix 4356 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form 4357 . i - the CSR row pointers 4358 - j - the CSR column indices 4359 4360 Level: developer 4361 4362 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()` 4363 @*/ 4364 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4365 { 4366 PetscFunctionBegin; 4367 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4368 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4369 if (i) *i = NULL; 4370 if (j) *j = NULL; 4371 (void)compressed; 4372 PetscFunctionReturn(PETSC_SUCCESS); 4373 } 4374 4375 /*@C 4376 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 4377 4378 Not Collective 4379 4380 Input Parameter: 4381 . A - a `MATSEQAIJCUSPARSE` matrix 4382 4383 Output Parameter: 4384 . a - pointer to the device data 4385 4386 Level: developer 4387 4388 Note: 4389 May trigger host-device copies if up-to-date matrix data is on host 4390 4391 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 4392 @*/ 4393 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) 4394 { 4395 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4396 CsrMatrix *csr; 4397 4398 PetscFunctionBegin; 4399 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4400 PetscValidPointer(a, 2); 4401 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4402 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4403 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4404 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4405 csr = (CsrMatrix *)cusp->mat->mat; 4406 PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4407 *a = csr->values->data().get(); 4408 PetscFunctionReturn(PETSC_SUCCESS); 4409 } 4410 4411 /*@C 4412 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()` 4413 4414 Not Collective 4415 4416 Input Parameters: 4417 + A - a `MATSEQAIJCUSPARSE` matrix 4418 - a - pointer to the device data 4419 4420 Level: developer 4421 4422 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()` 4423 @*/ 4424 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) 4425 { 4426 PetscFunctionBegin; 4427 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4428 PetscValidPointer(a, 2); 4429 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4430 *a = NULL; 4431 PetscFunctionReturn(PETSC_SUCCESS); 4432 } 4433 4434 /*@C 4435 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 4436 4437 Not Collective 4438 4439 Input Parameter: 4440 . A - a `MATSEQAIJCUSPARSE` matrix 4441 4442 Output Parameter: 4443 .
a - pointer to the device data 4444 4445 Level: developer 4446 4447 Note: 4448 May trigger host-device copies if up-to-date matrix data is on host 4449 4450 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 4451 @*/ 4452 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) 4453 { 4454 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4455 CsrMatrix *csr; 4456 4457 PetscFunctionBegin; 4458 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4459 PetscValidPointer(a, 2); 4460 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4461 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4462 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4463 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4464 csr = (CsrMatrix *)cusp->mat->mat; 4465 PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4466 *a = csr->values->data().get(); 4467 A->offloadmask = PETSC_OFFLOAD_GPU; 4468 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4469 PetscFunctionReturn(PETSC_SUCCESS); 4470 } 4471 /*@C 4472 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()` 4473 4474 Not Collective 4475 4476 Input Parameters: 4477 + A - a `MATSEQAIJCUSPARSE` matrix 4478 - a - pointer to the device data 4479 4480 Level: developer 4481 4482 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()` 4483 @*/ 4484 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) 4485 { 4486 PetscFunctionBegin; 4487 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4488 PetscValidPointer(a, 2); 4489 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4490 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4491 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4492 *a = NULL; 4493 PetscFunctionReturn(PETSC_SUCCESS); 4494 } 4495 4496 /*@C 4497 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 4498 4499 Not Collective 4500 4501 Input Parameter: 4502 . A - a `MATSEQAIJCUSPARSE` matrix 4503 4504 Output Parameter: 4505 . 
a - pointer to the device data 4506 4507 Level: developer 4508 4509 Note: 4510 Does not trigger host-device copies and flags data validity on the GPU 4511 4512 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 4513 @*/ 4514 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) 4515 { 4516 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4517 CsrMatrix *csr; 4518 4519 PetscFunctionBegin; 4520 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4521 PetscValidPointer(a, 2); 4522 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4523 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4524 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4525 csr = (CsrMatrix *)cusp->mat->mat; 4526 PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4527 *a = csr->values->data().get(); 4528 A->offloadmask = PETSC_OFFLOAD_GPU; 4529 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4530 PetscFunctionReturn(PETSC_SUCCESS); 4531 } 4532 4533 /*@C 4534 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()` 4535 4536 Not Collective 4537 4538 Input Parameters: 4539 + A - a `MATSEQAIJCUSPARSE` matrix 4540 - a - pointer to the device data 4541 4542 Level: developer 4543 4544 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()` 4545 @*/ 4546 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) 4547 { 4548 PetscFunctionBegin; 4549 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4550 PetscValidPointer(a, 2); 4551 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4552 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4553 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4554 *a = NULL; 4555 PetscFunctionReturn(PETSC_SUCCESS); 4556 } 4557 4558 struct IJCompare4 { 4559 __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4560 { 4561 if (t1.get<0>() < t2.get<0>()) return true; 4562 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4563 return false; 4564 } 4565 }; 4566 4567 struct Shift { 4568 int _shift; 4569 4570 Shift(int shift) : _shift(shift) { } 4571 __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4572 }; 4573 4574 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
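That is, if A is m x nA and B is m x nB, the result C is m x (nA + nB), with row i of C being row i of A followed by row i of B (the column indices of B are shifted by nA); the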
[A';B']' operation in matlab notation */ 4575 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4576 { 4577 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4578 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4579 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4580 CsrMatrix *Acsr, *Bcsr, *Ccsr; 4581 PetscInt Annz, Bnnz; 4582 cusparseStatus_t stat; 4583 PetscInt i, m, n, zero = 0; 4584 4585 PetscFunctionBegin; 4586 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4587 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4588 PetscValidPointer(C, 4); 4589 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4590 PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 4591 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 4592 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4593 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4594 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4595 if (reuse == MAT_INITIAL_MATRIX) { 4596 m = A->rmap->n; 4597 n = A->cmap->n + B->cmap->n; 4598 PetscCall(MatCreate(PETSC_COMM_SELF, C)); 4599 PetscCall(MatSetSizes(*C, m, n, m, n)); 4600 PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4601 c = (Mat_SeqAIJ *)(*C)->data; 4602 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4603 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4604 Ccsr = new CsrMatrix; 4605 Cmat->cprowIndices = NULL; 4606 c->compressedrow.use = PETSC_FALSE; 4607 c->compressedrow.nrows = 0; 4608 c->compressedrow.i = NULL; 4609 c->compressedrow.rindex = NULL; 4610 Ccusp->workVector = NULL; 4611 Ccusp->nrows = m; 4612 Ccusp->mat = Cmat; 4613 Ccusp->mat->mat = Ccsr; 4614 Ccsr->num_rows = m; 4615 Ccsr->num_cols = n; 4616 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4617 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4618 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4619 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 4620 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 4621 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 4622 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4623 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4624 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4625 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4626 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4627 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4628 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4629 4630 Acsr = (CsrMatrix *)Acusp->mat->mat; 4631 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4632 Annz = (PetscInt)Acsr->column_indices->size(); 4633 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4634 c->nz = Annz + Bnnz; 4635 Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4636 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4637 Ccsr->values = new THRUSTARRAY(c->nz); 4638 Ccsr->num_entries = c->nz;
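/* Descriptive note (added): cooPerm records where each input entry lands in C. After the merge below,
   cooPerm[0..Annz) holds the positions in C's arrays of A's entries (in order) and
   cooPerm[Annz..Annz+Bnnz) those of B's entries; it is recovered from wPerm, which tags each merged
   entry with 1 (came from A) or 0 (came from B). */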
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need the full row offsets */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need the full row offsets */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* issues when using bool with large matrices on SUMMIT CUDA 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 /* errors on SUMMIT CUDA 11.1.0 */
      PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
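      /* At this point the stencil wPerm holds, in merged order, a 1 for entries that
         came from A and a 0 for entries that came from B. The copy_if/remove_copy_if
         pair above splits the merged positions into cooPerm accordingly, as an
         order-preserving equivalent of the partition_copy in the #if 0 branch.
         Next, convert the merged COO row indices back into CSR row offsets. */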
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C's transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); /* A^T's trailing offset (a->nz) equals B^T's first offset once shifted, so overwrite it */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
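        /* With CUDA 11 and later, the legacy cusparseMatDescr_t above is complemented
           by a generic-API cusparseSpMatDescr_t, created below, which the SpMV/SpMM
           code paths consume. */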
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32-to-64-bit conversion on the GPU, then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
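      /* Numeric-only update: cooPerm, built during the MAT_INITIAL_MATRIX phase, maps
         the k-th stored value of A (and, after the first Annz slots, of B) to its slot
         in C. The two for_each calls below scatter A's and B's current values into C
         through permutation iterators, with no symbolic work repeated. */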
"B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4822 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4823 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 4824 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size()); 4825 auto pmid = Ccusp->cooPerm->begin(); 4826 thrust::advance(pmid, Acsr->num_entries); 4827 PetscCall(PetscLogGpuTimeBegin()); 4828 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin()))); 4829 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4830 thrust::for_each(zibait, zieait, VecCUDAEquals()); 4831 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4832 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end()))); 4833 thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 4834 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 4835 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4836 PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4837 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4838 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4839 CsrMatrix *BcsrT = BT ? 
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* device-side gather: v[k] = av[idx[k]] for k = 0, ..., n-1, staged through the temporary w when v is host memory */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar))); /* result moved device to host, so log GPU-to-CPU traffic */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
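/* A minimal calling sketch for MatSeqAIJCUSPARSEMergeMats() defined above (illustrative
   only; it assumes A and B are assembled MATSEQAIJCUSPARSE matrices with the same number
   of rows):

     Mat C;
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C)); // C = [A, B]
     // ... change the numerical values of A and/or B, keeping their patterns ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));   // values-only update
     PetscCall(MatDestroy(&C));
*/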