1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 #include <thrust/adjacent_difference.h> 15 #if PETSC_CPP_VERSION >= 14 16 #define PETSC_HAVE_THRUST_ASYNC 1 17 // thrust::for_each(thrust::cuda::par.on()) requires C++14 18 #include <thrust/async/for_each.h> 19 #endif 20 #include <thrust/iterator/constant_iterator.h> 21 #include <thrust/remove.h> 22 #include <thrust/sort.h> 23 #include <thrust/unique.h> 24 25 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 26 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 27 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 28 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
29 30 typedef enum { 31 CUSPARSE_MV_ALG_DEFAULT = 0, 32 CUSPARSE_COOMV_ALG = 1, 33 CUSPARSE_CSRMV_ALG1 = 2, 34 CUSPARSE_CSRMV_ALG2 = 3 35 } cusparseSpMVAlg_t; 36 37 typedef enum { 38 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 39 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 40 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 41 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 42 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 43 CUSPARSE_SPMM_ALG_DEFAULT = 0, 44 CUSPARSE_SPMM_COO_ALG1 = 1, 45 CUSPARSE_SPMM_COO_ALG2 = 2, 46 CUSPARSE_SPMM_COO_ALG3 = 3, 47 CUSPARSE_SPMM_COO_ALG4 = 5, 48 CUSPARSE_SPMM_CSR_ALG1 = 4, 49 CUSPARSE_SPMM_CSR_ALG2 = 6, 50 } cusparseSpMMAlg_t; 51 52 typedef enum { 53 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic 54 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic 55 } cusparseCsr2CscAlg_t; 56 */ 57 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 58 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0}; 59 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! 
We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 60 #endif 61 62 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 63 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 64 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 65 66 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 67 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 68 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 69 70 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 71 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 72 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 73 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 74 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject); 75 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 76 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 77 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 78 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 79 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 80 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 81 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 82 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 83 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 84 85 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 86 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 87 
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/*
  Type-specific implementation behind MatCUSPARSESetFormat().

  Records `format` in the Mat_SeqAIJCUSPARSE structure hanging off A->spptr.
  MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are handled identically (both store the
  format in the same field); every other operation raises PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  /* reject anything other than the two supported operations before touching the matrix */
  if (op != MAT_CUSPARSE_MULT && op != MAT_CUSPARSE_ALL) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->format = format;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to whatever implementation the matrix composed under "MatCUSPARSESetFormat_C" */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): stores the flag in
  the Mat_SeqAIJCUSPARSE structure hanging off A->spptr.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  Use this method to specify whether the solve is done on the CPU or GPU (GPU is the default).
.seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to whatever implementation the matrix composed under "MatCUSPARSESetUseCPUSolve_C" */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSetOption() for the CUSPARSE matrix type.

  MAT_FORM_EXPLICIT_TRANSPOSE is handled here; every other option is forwarded to
  MatSetOption_SeqAIJ().
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when the option goes from on to off, drop any existing transpose so that
       turning it back on later cannot pick up stale data */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/*
  Numeric LU factorization: the factorization itself is done by MatLUFactorNumeric_SeqAIJ()
  after the values of A have been brought back from the GPU. Unless use_cpu_solve is set, the
  solve callbacks of B are then pointed at the CUSPARSE versions and the triangular factors are
  analysed and copied to the GPU.

  NOTE(review): B->spptr is read here as Mat_SeqAIJCUSPARSE, whereas
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B) reads the same pointer as
  Mat_SeqAIJCUSPARSETriFactors -- confirm use_cpu_solve is valid for factor matrices.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *fact    = (Mat_SeqAIJ *)B->data;
  Mat_SeqAIJCUSPARSE *cusp    = (Mat_SeqAIJCUSPARSE *)B->spptr;
  IS                  rowperm = fact->row, colperm = fact->col;
  PetscBool           rowIsIdentity, colIsIdentity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* the orderings decide which MatSolve variant is installed below */
  PetscCall(ISIdentity(rowperm, &rowIsIdentity));
  PetscCall(ISIdentity(colperm, &colIsIdentity));

  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* CPU solve requested: leave the solve callbacks alone and skip the GPU setup */
  if (cusp->use_cpu_solve) PetscFunctionReturn(PETSC_SUCCESS);

  if (rowIsIdentity && colIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Processes the -mat_cusparse_* command line options. Options are only queried for
  unfactored matrices (A->factortype == MAT_FACTOR_NONE).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  Mat_SeqAIJCUSPARSE      *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                given;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&fmt, &given));
    if (given) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, fmt));

    /* storage format used for all operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&fmt, &given));
    if (given) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, fmt));

    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusp->use_cpu_solve, &cusp->use_cpu_solve, &given));
    if (given) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusp->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusp->spmvAlg, (PetscEnum *)&cusp->spmvAlg, &given));
    /* PetscOptionsEnum() maps a name to its position in the string table, so when the user
       actually set the option make sure that position still matches the cuSPARSE enum value */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!given || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!given || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusp->spmmAlg, (PetscEnum *)&cusp->spmmAlg, &given));
    PetscCheck(!given || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusp->csr2cscAlg, (PetscEnum *)&cusp->csr2cscAlg, &given));
    PetscCheck(!given || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  /*
    Builds (first call) or refreshes the values of (later calls) the lower triangular
    factor on the GPU from the host factor data in A->data. Each row receives an explicit
    1.0 on the diagonal and the matrix descriptor is marked CUSPARSE_DIAG_TYPE_UNIT.
    Nothing is done when A has no rows or when A->offloadmask is neither
    PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.
  */
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ *)A->data;
  const PetscInt                     n       = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  const PetscInt                    *ai = aij->i, *aj = aij->j;
  const MatScalar                   *aa = aij->a;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(PETSC_SUCCESS);
  try {
    /* number of nonzeros in the lower triangular matrix, including the 1's on the diagonal */
    const PetscInt   nzLower = n + ai[n] - ai[1];
    const MatScalar *srcVals = aa; /* walks the host values row by row */
    PetscInt         dst     = 1;  /* next free slot in the packed arrays; slot 0 is the diagonal of row 0 */

    if (!lo) {
      PetscScalar    *AALo;
      PetscInt       *AiLo, *AjLo;
      const PetscInt *srcCols  = aj;
      PetscInt        rowStart = 1;
      CsrMatrix      *csr;

      /* pinned host staging buffers: values, row offsets, column indices */
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
      PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

      /* row 0 holds only its diagonal entry */
      AiLo[0] = (PetscInt)0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt)0;
      AALo[0] = (MatScalar)1.0;
      for (PetscInt row = 1; row < n; row++) {
        const PetscInt nz = ai[row + 1] - ai[row];

        /* each row occupies nz copied entries plus one slot for the diagonal */
        AiLo[row] = rowStart;
        rowStart += nz + 1;

        PetscCall(PetscArraycpy(AjLo + dst, srcCols, nz));
        PetscCall(PetscArraycpy(AALo + dst, srcVals, nz));
        dst += nz;

        /* the diagonal entry goes last in the row */
        AjLo[dst] = (PetscInt)row;
        AALo[dst] = (MatScalar)1.0;
        dst++;

        srcVals += nz;
        srcCols += nz;
      }

      /* allocate space for the triangular factor information */
      PetscCall(PetscNew(&lo));
      lo->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero based, lower fill, unit diagonal */
      PetscCallCUSPARSE(cusparseCreateMatDescr(&lo->descr));
      PetscCallCUSPARSE(cusparseSetMatIndexBase(lo->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
      PetscCallCUSPARSE(cusparseSetMatType(lo->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
      PetscCallCUSPARSE(cusparseSetMatType(lo->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
      PetscCallCUSPARSE(cusparseSetMatFillMode(lo->descr, CUSPARSE_FILL_MODE_LOWER));
      PetscCallCUSPARSE(cusparseSetMatDiagType(lo->descr, CUSPARSE_DIAG_TYPE_UNIT));

      /* set the operation */
      lo->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* device CSR storage, filled from the staging buffers */
      lo->csrMat       = new CsrMatrix;
      csr              = lo->csrMat;
      csr->num_rows    = n;
      csr->num_cols    = n;
      csr->num_entries = nzLower;

      csr->row_offsets = new THRUSTINTARRAY32(n + 1);
      csr->row_offsets->assign(AiLo, AiLo + n + 1);

      csr->column_indices = new THRUSTINTARRAY32(nzLower);
      csr->column_indices->assign(AjLo, AjLo + nzLower);

      csr->values = new THRUSTARRAY(nzLower);
      csr->values->assign(AALo, AALo + nzLower);

      /* Create the solve analysis information */
      PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
      PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&lo->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
      PetscCallCUSPARSE(cusparseXcsrsv_buffsize(factors->handle, lo->solveOp, csr->num_rows, csr->num_entries, lo->descr, csr->values->data().get(), csr->row_offsets->data().get(), csr->column_indices->data().get(), lo->solveInfo, &lo->solveBufferSize));
      PetscCallCUDA(cudaMalloc(&lo->solveBuffer, lo->solveBufferSize));
#endif

      /* perform the solve analysis */
      PetscCallCUSPARSE(cusparseXcsrsv_analysis(factors->handle, lo->solveOp, csr->num_rows, csr->num_entries, lo->descr, csr->values->data().get(), csr->row_offsets->data().get(), csr->column_indices->data().get(), lo->solveInfo, lo->solvePolicy, lo->solveBuffer));
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

      /* publish the factor; the value staging buffer is kept for later refreshes */
      factors->loTriFactorPtr = lo;
      lo->AA_h                = AALo;
      PetscCallCUDA(cudaFreeHost(AiLo));
      PetscCallCUDA(cudaFreeHost(AjLo));
      PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
    } else { /* update values only */
      if (!lo->AA_h) PetscCallCUDA(cudaMallocHost((void **)&lo->AA_h, nzLower * sizeof(PetscScalar)));

      /* same packing as on the first call, values only */
      lo->AA_h[0] = 1.0;
      for (PetscInt row = 1; row < n; row++) {
        const PetscInt nz = ai[row + 1] - ai[row];

        PetscCall(PetscArraycpy(&(lo->AA_h[dst]), srcVals, nz));
        dst += nz;
        lo->AA_h[dst] = 1.0;
        dst++;
        srcVals += nz;
      }
      lo->csrMat->values->assign(lo->AA_h, lo->AA_h + nzLower);
      PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
    }
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Builds (first call) or refreshes the values of (later calls) the upper triangular
  factor on the GPU from the host factor data in A->data. Rows are located through
  a->diag; the value stored for each diagonal entry is 1/v[nz], where v[nz] is the host
  entry following the row's nz off-diagonal entries. Nothing is done when A has no rows or
  when A->offloadmask is neither PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ *)A->data;
  const PetscInt                     n       = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  const PetscInt                    *aj = aij->j, *adiag = aij->diag;
  const MatScalar                   *aa = aij->a;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(PETSC_SUCCESS);
  try {
    /* number of nonzeros in the upper triangular matrix */
    const PetscInt nzUpper = adiag[0] - adiag[n];
    PetscInt       dst     = nzUpper; /* the packed arrays are filled from the back, last row first */

    if (!up) {
      PetscScalar *AAUp;
      PetscInt    *AiUp, *AjUp;
      CsrMatrix   *csr;

      /* pinned host staging buffers: values, row offsets, column indices */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
      PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

      AiUp[0] = (PetscInt)0;
      AiUp[n] = nzUpper;
      for (PetscInt row = n - 1; row >= 0; row--) {
        const MatScalar *rowVals = aa + adiag[row + 1] + 1;
        const PetscInt  *rowCols = aj + adiag[row + 1] + 1;
        /* number of elements NOT on the diagonal */
        const PetscInt nz = adiag[row] - adiag[row + 1] - 1;

        /* step back to the first slot of this row */
        dst -= (nz + 1);

        /* diagonal first, then the off-diagonal entries */
        AjUp[dst] = (PetscInt)row;
        AAUp[dst] = (MatScalar)1. / rowVals[nz];
        AiUp[row] = AiUp[row + 1] - (nz + 1);

        PetscCall(PetscArraycpy(&(AjUp[dst + 1]), rowCols, nz));
        PetscCall(PetscArraycpy(&(AAUp[dst + 1]), rowVals, nz));
      }

      /* allocate space for the triangular factor information */
      PetscCall(PetscNew(&up));
      up->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero based, upper fill, non-unit diagonal */
      PetscCallCUSPARSE(cusparseCreateMatDescr(&up->descr));
      PetscCallCUSPARSE(cusparseSetMatIndexBase(up->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
      PetscCallCUSPARSE(cusparseSetMatType(up->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
      PetscCallCUSPARSE(cusparseSetMatType(up->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
      PetscCallCUSPARSE(cusparseSetMatFillMode(up->descr, CUSPARSE_FILL_MODE_UPPER));
      PetscCallCUSPARSE(cusparseSetMatDiagType(up->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

      /* set the operation */
      up->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* device CSR storage, filled from the staging buffers */
      up->csrMat       = new CsrMatrix;
      csr              = up->csrMat;
      csr->num_rows    = n;
      csr->num_cols    = n;
      csr->num_entries = nzUpper;

      csr->row_offsets = new THRUSTINTARRAY32(n + 1);
      csr->row_offsets->assign(AiUp, AiUp + n + 1);

      csr->column_indices = new THRUSTINTARRAY32(nzUpper);
      csr->column_indices->assign(AjUp, AjUp + nzUpper);

      csr->values = new THRUSTARRAY(nzUpper);
      csr->values->assign(AAUp, AAUp + nzUpper);

      /* Create the solve analysis information */
      PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
      PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&up->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
      PetscCallCUSPARSE(cusparseXcsrsv_buffsize(factors->handle, up->solveOp, csr->num_rows, csr->num_entries, up->descr, csr->values->data().get(), csr->row_offsets->data().get(), csr->column_indices->data().get(), up->solveInfo, &up->solveBufferSize));
      PetscCallCUDA(cudaMalloc(&up->solveBuffer, up->solveBufferSize));
#endif

      /* perform the solve analysis */
      PetscCallCUSPARSE(cusparseXcsrsv_analysis(factors->handle, up->solveOp, csr->num_rows, csr->num_entries, up->descr, csr->values->data().get(), csr->row_offsets->data().get(), csr->column_indices->data().get(), up->solveInfo, up->solvePolicy, up->solveBuffer));

      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

      /* publish the factor; the value staging buffer is kept for later refreshes */
      factors->upTriFactorPtr = up;
      up->AA_h                = AAUp;
      PetscCallCUDA(cudaFreeHost(AiUp));
      PetscCallCUDA(cudaFreeHost(AjUp));
      PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
    } else { /* update values only */
      if (!up->AA_h) PetscCallCUDA(cudaMallocHost((void **)&up->AA_h, nzUpper * sizeof(PetscScalar)));

      /* same packing as on the first call, values only */
      for (PetscInt row = n - 1; row >= 0; row--) {
        const MatScalar *rowVals = aa + adiag[row + 1] + 1;
        /* number of elements NOT on the diagonal */
        const PetscInt nz = adiag[row] - adiag[row + 1] - 1;

        /* step back to the first slot of this row */
        dst -= (nz + 1);

        /* diagonal first, then the off-diagonal entries */
        up->AA_h[dst] = 1. / rowVals[nz];
        PetscCall(PetscArraycpy(&(up->AA_h[dst + 1]), rowVals, nz));
      }
      up->csrMat->values->assign(up->AA_h, up->AA_h + nzUpper);
      PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
    }
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Builds/refreshes both ILU triangular factors on the GPU, makes sure the work vector
  exists, marks A as PETSC_OFFLOAD_BOTH and, for non-identity orderings, copies the row and
  column permutation indices to the GPU the first time they are needed.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row, colperm = aij->icol;
  PetscBool                     isIdentity;
  const PetscInt                n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation, taken from a->row */
  PetscCall(ISIdentity(rowperm, &isIdentity));
  if (!isIdentity && !factors->rpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(rowperm, &idx));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(idx, idx + n);
    PetscCall(ISRestoreIndices(rowperm, &idx));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* column permutation, taken from a->icol */
  PetscCall(ISIdentity(colperm, &isIdentity));
  if (!isIdentity && !factors->cpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(colperm, &idx));
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(idx, idx + n);
    PetscCall(ISRestoreIndices(colperm, &idx));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 609 for (j = offset; j < offset + nz; j++) { 610 AAUp[j] = -AAUp[j]; 611 AALo[j] = AAUp[j] / v[nz]; 612 } 613 offset += nz; 614 } 615 } 616 617 /* allocate space for the triangular factor information */ 618 PetscCall(PetscNew(&upTriFactor)); 619 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 620 621 /* Create the matrix description */ 622 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 623 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 624 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 625 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 626 #else 627 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 628 #endif 629 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 630 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 631 632 /* set the matrix */ 633 upTriFactor->csrMat = new CsrMatrix; 634 upTriFactor->csrMat->num_rows = A->rmap->n; 635 upTriFactor->csrMat->num_cols = A->cmap->n; 636 upTriFactor->csrMat->num_entries = a->nz; 637 638 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 639 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 640 641 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 642 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 643 644 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 645 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 646 647 /* set the operation */ 648 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 649 650 /* Create the solve analysis information */ 651 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 652 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 653 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 654 
PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 655 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 656 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 657 #endif 658 659 /* perform the solve analysis */ 660 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 661 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 662 663 PetscCallCUDA(WaitForCUDA()); 664 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 665 666 /* assign the pointer */ 667 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 668 669 /* allocate space for the triangular factor information */ 670 PetscCall(PetscNew(&loTriFactor)); 671 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 672 673 /* Create the matrix description */ 674 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 675 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 676 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 677 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 678 #else 679 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 680 #endif 681 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 682 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 683 684 /* set the operation */ 685 
loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 686 687 /* set the matrix */ 688 loTriFactor->csrMat = new CsrMatrix; 689 loTriFactor->csrMat->num_rows = A->rmap->n; 690 loTriFactor->csrMat->num_cols = A->cmap->n; 691 loTriFactor->csrMat->num_entries = a->nz; 692 693 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 694 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 695 696 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 697 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 698 699 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 700 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 701 702 /* Create the solve analysis information */ 703 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 704 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 705 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 706 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 707 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 708 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 709 #endif 710 711 /* perform the solve analysis */ 712 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 713 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 714 715 PetscCallCUDA(WaitForCUDA()); 716 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 717 718 /* assign the pointer 
*/
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds the ICC triangular factors of A on the GPU and
  uploads the ordering permutation when it is not the identity.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Calls MatSeqAIJCUSPARSEBuildICCTriMatrices(), lazily creates the work vector of length n,
  sets cusparseTriFactors->nnz to (a->nz - n) * 2 + n and marks A as PETSC_OFFLOAD_BOTH.

  When a->row is not the identity, rpermIndices receives the permutation and cpermIndices its
  inverse. The device index arrays are allocated only when absent and are otherwise refilled in
  place, so calling this routine again does not overwrite (and leak) the previous arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    /* reuse existing device arrays instead of replacing the pointers; assign() resizes as needed */
    if (!cusparseTriFactors->rpermIndices) cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    if (!cusparseTriFactors->cpermIndices) cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization: copies A back from the
  GPU, runs the host MatCholeskyFactorNumeric_SeqAIJ(), selects the solve routines according to
  whether the ordering is the identity, and then builds the GPU triangular factors of B.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
*/
  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor - Builds the explicit transpose of one
  triangular factor and runs the cuSPARSE solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix, used only as the object attached to the log events
. cusparseTriFactors - container providing the cuSPARSE handle
- triFactor          - the factor to transpose; with CUDA >= 11 its csr2cscBuffer and
                       csr2cscBufferSize fields are (re)set here

  Output Parameter:
. triFactorT_out - newly allocated transposed factor, written only once every step has succeeded

  Notes:
  The transposed factor copies the index base, matrix type and diagonal type of the source
  descriptor, flips the fill mode, and is solved with CUSPARSE_OPERATION_NON_TRANSPOSE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor(Mat A, Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors, Mat_SeqAIJCUSPARSETriFactorStruct *triFactor, Mat_SeqAIJCUSPARSETriFactorStruct **triFactorT_out)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  PetscCall(PetscNew(&triFactorT));
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* derive the descriptor settings from the source factor; the fill mode is flipped */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&triFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(triFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(triFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(triFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(triFactorT->descr, diagType));

  /* set the operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor (row/column counts are swapped) */
  triFactorT->csrMat                 = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows + 1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows, triFactor->csrMat->num_cols, triFactor->csrMat->num_entries, triFactor->csrMat->values->data().get(),
                                                  triFactor->csrMat->row_offsets->data().get(), triFactor->csrMat->column_indices->data().get(), triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                                  triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactor->csr2cscBuffer, triFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows, triFactor->csrMat->num_cols, triFactor->csrMat->num_entries, triFactor->csrMat->values->data().get(), triFactor->csrMat->row_offsets->data().get(),
                                 triFactor->csrMat->column_indices->data().get(), triFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer);
#else
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  /* close the event opened above; this was previously a second PetscLogEventBegin(), leaving the event unbalanced */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&triFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, triFactorT->solveOp, triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr, triFactorT->csrMat->values->data().get(),
                                            triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo, &triFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactorT->solveBuffer, triFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, triFactorT->solveOp, triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr, triFactorT->csrMat->values->data().get(),
                                            triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo, triFactorT->solvePolicy, triFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  *triFactorT_out = triFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Creates the explicitly transposed lower and upper
  triangular factors of A (with their solve analysis) and stores them in
  loTriFactorPtrTranspose / upTriFactorPtrTranspose of A->spptr.

  Input Parameter:
. A - factored matrix whose loTriFactorPtr and upTriFactorPtr are already set

  Notes:
  The lower factor is processed first and its pointer is stored before the upper factor is started.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor(A, cusparseTriFactors, loTriFactor, &loTriFactorT));
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/
  PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor(A, cusparseTriFactors, upTriFactor, &upTriFactorT));
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Unary functor converting a PetscScalar to PetscInt by truncating its real part; usable on host and device */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Creates or refreshes the explicit transpose kept in
  the matTranspose field of A->spptr; returns early when A->transupdated is already set.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct =
(Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 1006 PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1007 matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 1008 PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 1009 if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 1010 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1011 PetscCall(PetscLogGpuTimeBegin()); 1012 if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1013 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1014 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1015 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1016 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1017 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1018 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1019 1020 /* set alpha and beta */ 1021 PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar))); 1022 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar))); 1023 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1024 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1025 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1026 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1027 1028 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1029 CsrMatrix *matrixT = new CsrMatrix; 1030 matstructT->mat = matrixT; 1031 matrixT->num_rows = A->cmap->n; 1032 matrixT->num_cols = A->rmap->n; 1033 matrixT->num_entries = a->nz; 1034 matrixT->row_offsets = new 
THRUSTINTARRAY32(matrixT->num_rows + 1); 1035 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1036 matrixT->values = new THRUSTARRAY(a->nz); 1037 1038 if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1039 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1040 1041 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1042 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 1043 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1044 indexBase, cusparse_scalartype); 1045 PetscCallCUSPARSE(stat); 1046 #else 1047 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1048 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1049 1050 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1051 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1052 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1053 */ 1054 if (matrixT->num_entries) { 1055 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 1056 PetscCallCUSPARSE(stat); 1057 1058 } else { 1059 matstructT->matDescr = NULL; 1060 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1061 } 1062 #endif 1063 #endif 1064 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1065 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1066 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1067 #else 1068 CsrMatrix *temp = new CsrMatrix; 1069 CsrMatrix *tempT = new CsrMatrix; 1070 /* First convert HYB to CSR */ 1071 temp->num_rows = A->rmap->n; 1072 temp->num_cols = A->cmap->n; 1073 temp->num_entries = a->nz; 1074 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1075 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1076 temp->values = new THRUSTARRAY(a->nz); 1077 1078 stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 1079 PetscCallCUSPARSE(stat); 1080 1081 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1082 tempT->num_rows = A->rmap->n; 1083 tempT->num_cols = A->cmap->n; 1084 tempT->num_entries = a->nz; 1085 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1086 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1087 tempT->values = new THRUSTARRAY(a->nz); 1088 1089 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 1090 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1091 PetscCallCUSPARSE(stat); 1092 1093 /* Last, convert CSC to HYB */ 1094 cusparseHybMat_t hybMat; 1095 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1096 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1097 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 1098 PetscCallCUSPARSE(stat); 1099 1100 /* assign the pointer */ 1101 matstructT->mat = hybMat; 1102 A->transupdated = PETSC_TRUE; 1103 /* delete temporaries */ 1104 if (tempT) { 1105 if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1106 if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1107 if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1108 delete (CsrMatrix *)tempT; 1109 } 1110 if (temp) { 1111 if (temp->values) delete (THRUSTARRAY *)temp->values; 1112 if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1113 if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1114 delete (CsrMatrix *)temp; 1115 } 1116 #endif 1117 } 1118 } 1119 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, 
update data */ 1120 CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1121 CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 1122 PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 1123 PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 1124 PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 1125 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 1126 PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 1127 PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 1128 PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 1129 PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1130 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1131 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1132 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1133 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1134 } 1135 if (!cusparsestruct->csr2csc_i) { 1136 THRUSTARRAY csr2csc_a(matrix->num_entries); 1137 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1138 1139 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1140 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1141 void *csr2cscBuffer; 1142 size_t csr2cscBufferSize; 1143 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1144 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, 
&csr2cscBufferSize); 1145 PetscCallCUSPARSE(stat); 1146 PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1147 #endif 1148 1149 if (matrix->num_entries) { 1150 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1151 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1152 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1153 1154 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1155 should be filled with indexBase. So I just take a shortcut here. 1156 */ 1157 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1158 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1159 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 1160 PetscCallCUSPARSE(stat); 1161 #else 1162 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1163 PetscCallCUSPARSE(stat); 1164 #endif 1165 } else { 1166 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1167 } 1168 1169 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1170 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1171 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1172 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1173 #endif 1174 } 1175 PetscCallThrust( 1176 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve on the GPU using the explicitly transposed
  triangular factors together with the row and column permutations.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are created on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
  Sequence: gather bb through rpermIndices into xx, solve with the transposed upper factor into
  the work vector, solve with the transposed lower factor back into xx, gather xx through
  cpermIndices into the work vector, then copy the work vector to xx.

  The thrust calls go through PetscCallThrust(), as elsewhere in this file, so that a thrust
  failure is reported as a PETSc error instead of escaping as a C++ exception.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transpose solve on the GPU without any
  permutation step.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are created on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
  bb is fed directly to the solve with the transposed upper factor, whose result lands in the
  work vector; the solve with the transposed lower factor then writes xx.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
thrust::device_ptr<PetscScalar> xGPU; 1285 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1286 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1287 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1288 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1289 1290 PetscFunctionBegin; 1291 /* Get the GPU pointers */ 1292 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1293 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1294 xGPU = thrust::device_pointer_cast(xarray); 1295 bGPU = thrust::device_pointer_cast(barray); 1296 1297 PetscCall(PetscLogGpuTimeBegin()); 1298 /* First, reorder with the row permutation */ 1299 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1300 1301 /* Next, solve L */ 1302 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1303 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1304 1305 /* Then, solve U */ 1306 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1307 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), 
upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1308 1309 /* Last, reorder with the column permutation */ 1310 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 1311 1312 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1313 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1314 PetscCall(PetscLogGpuTimeEnd()); 1315 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1316 PetscFunctionReturn(PETSC_SUCCESS); 1317 } 1318 1319 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1320 { 1321 const PetscScalar *barray; 1322 PetscScalar *xarray; 1323 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1324 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1325 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1326 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1327 1328 PetscFunctionBegin; 1329 /* Get the GPU pointers */ 1330 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1331 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1332 1333 PetscCall(PetscLogGpuTimeBegin()); 1334 /* First, solve L */ 1335 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1336 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1337 1338 /* Next, solve U */ 1339 
PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1340 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1341 1342 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1343 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1344 PetscCall(PetscLogGpuTimeEnd()); 1345 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1346 PetscFunctionReturn(PETSC_SUCCESS); 1347 } 1348 1349 #if CUSPARSE_VERSION >= 11500 1350 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */ 1351 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) 1352 { 1353 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1354 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1355 const PetscScalar *barray; 1356 PetscScalar *xarray; 1357 1358 PetscFunctionBegin; 1359 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1360 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1361 PetscCall(PetscLogGpuTimeBegin()); 1362 1363 /* Solve L*y = b */ 1364 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1365 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1366 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1367 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, 1368 fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()! 
1369 1370 /* Solve U*x = y */ 1371 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1372 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */ 1373 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U)); 1374 1375 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1376 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1377 1378 PetscCall(PetscLogGpuTimeEnd()); 1379 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1380 PetscFunctionReturn(PETSC_SUCCESS); 1381 } 1382 1383 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) 1384 { 1385 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1386 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1387 const PetscScalar *barray; 1388 PetscScalar *xarray; 1389 1390 PetscFunctionBegin; 1391 if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */ 1392 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1393 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. 
We only do transpose solve with it */ 1394 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1395 1396 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1397 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1398 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1399 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1400 fs->createdTransposeSpSVDescr = PETSC_TRUE; 1401 } 1402 1403 if (!fs->updatedTransposeSpSVAnalysis) { 1404 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1405 1406 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1407 fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1408 } 1409 1410 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1411 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1412 PetscCall(PetscLogGpuTimeBegin()); 1413 1414 /* Solve Ut*y = b */ 1415 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1416 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1417 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */ 1418 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut)); 1419 1420 /* Solve Lt*x = y */ 1421 
PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1422 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1423 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1424 1425 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1426 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1427 PetscCall(PetscLogGpuTimeEnd()); 1428 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1429 PetscFunctionReturn(PETSC_SUCCESS); 1430 } 1431 1432 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *) 1433 { 1434 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1435 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1436 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1437 CsrMatrix *Acsr; 1438 PetscInt m, nz; 1439 PetscBool flg; 1440 1441 PetscFunctionBegin; 1442 if (PetscDefined(USE_DEBUG)) { 1443 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1444 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1445 } 1446 1447 /* Copy A's value to fact */ 1448 m = fact->rmap->n; 1449 nz = aij->nz; 1450 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1451 Acsr = (CsrMatrix *)Acusp->mat->mat; 1452 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1453 1454 /* Factorize fact inplace */ 1455 if (m) 1456 PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1457 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1458 if (PetscDefined(USE_DEBUG)) { 1459 int numerical_zero; 1460 cusparseStatus_t status; 1461 status = 
cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1462 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1463 } 1464 1465 /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 1466 See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78 1467 */ 1468 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1469 1470 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1471 1472 /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 1473 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 1474 1475 fact->offloadmask = PETSC_OFFLOAD_GPU; 1476 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ILU0; 1477 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_ILU0; 1478 fact->ops->matsolve = NULL; 1479 fact->ops->matsolvetranspose = NULL; 1480 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1481 PetscFunctionReturn(PETSC_SUCCESS); 1482 } 1483 1484 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1485 { 1486 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1487 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1488 PetscInt m, nz; 1489 1490 PetscFunctionBegin; 1491 if (PetscDefined(USE_DEBUG)) { 1492 PetscInt i; 1493 PetscBool flg, missing; 1494 1495 PetscCall(PetscObjectTypeCompare((PetscObject)A, 
MATSEQAIJCUSPARSE, &flg)); 1496 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1497 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1498 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1499 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1500 } 1501 1502 /* Free the old stale stuff */ 1503 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1504 1505 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1506 but they will not be used. Allocate them just for easy debugging. 1507 */ 1508 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1509 1510 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1511 fact->factortype = MAT_FACTOR_ILU; 1512 fact->info.factor_mallocs = 0; 1513 fact->info.fill_ratio_given = info->fill; 1514 fact->info.fill_ratio_needed = 1.0; 1515 1516 aij->row = NULL; 1517 aij->col = NULL; 1518 1519 /* ====================================================================== */ 1520 /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 1521 /* We'll do in-place factorization on fact */ 1522 /* ====================================================================== */ 1523 const int *Ai, *Aj; 1524 1525 m = fact->rmap->n; 1526 nz = aij->nz; 1527 1528 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1))); 1529 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz)); 1530 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 1531 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 1532 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1533 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1534 1535 /* ====================================================================== */ 1536 /* Create descriptors for M, L, U */ 1537 /* ====================================================================== */ 1538 cusparseFillMode_t fillMode; 1539 cusparseDiagType_t diagType; 1540 1541 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1542 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1543 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1544 1545 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1546 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1547 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1548 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1549 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1550 */ 1551 fillMode = CUSPARSE_FILL_MODE_LOWER; 1552 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1553 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1554 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1555 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1556 1557 fillMode = CUSPARSE_FILL_MODE_UPPER; 1558 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1559 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1560 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1561 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1562 1563 /* ========================================================================= */ 1564 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1565 /* ========================================================================= */ 1566 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1567 if (m) 1568 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1569 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M)); 1570 1571 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1572 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1573 1574 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1575 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1576 1577 
PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1578 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1579 1580 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1581 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1582 1583 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1584 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1585 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1586 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1587 */ 1588 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1589 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1590 fs->spsvBuffer_L = fs->factBuffer_M; 1591 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1592 } else { 1593 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1594 fs->spsvBuffer_U = fs->factBuffer_M; 1595 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1596 } 1597 1598 /* ========================================================================== */ 1599 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1600 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1601 /* ========================================================================== */ 1602 int structural_zero; 1603 cusparseStatus_t status; 1604 1605 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1606 if (m) 1607 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1608 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1609 if (PetscDefined(USE_DEBUG)) { 1610 /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1611 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1612 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1613 } 1614 1615 /* Estimate FLOPs of the numeric factorization */ 1616 { 1617 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1618 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1619 PetscLogDouble flops = 0.0; 1620 1621 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1622 Ai = Aseq->i; 1623 Adiag = Aseq->diag; 1624 for (PetscInt i = 0; i < m; i++) { 1625 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1626 nzRow = Ai[i + 1] - Ai[i]; 1627 nzLeft = Adiag[i] - Ai[i]; 1628 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1629 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1630 */ 1631 nzLeft = (nzRow - 1) / 2; 1632 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1633 } 1634 } 1635 fs->numericFactFlops = flops; 1636 } 1637 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1638 PetscFunctionReturn(PETSC_SUCCESS); 1639 } 1640 1641 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1642 { 1643 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1644 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1645 const PetscScalar *barray; 1646 PetscScalar *xarray; 1647 1648 PetscFunctionBegin; 1649 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1650 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1651 PetscCall(PetscLogGpuTimeBegin()); 1652 1653 /* Solve L*y = b */ 1654 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1655 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1656 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1657 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1658 1659 /* Solve Lt*x = y */ 1660 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1661 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1662 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1663 1664 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1665 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1666 1667 PetscCall(PetscLogGpuTimeEnd()); 1668 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1669 PetscFunctionReturn(PETSC_SUCCESS); 1670 } 1671 1672 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1673 { 1674 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1675 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1676 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1677 CsrMatrix *Acsr; 1678 PetscInt m, nz; 1679 PetscBool flg; 1680 1681 PetscFunctionBegin; 1682 if (PetscDefined(USE_DEBUG)) { 1683 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1684 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1685 } 1686 1687 /* Copy A's value to fact */ 1688 m = fact->rmap->n; 1689 nz = aij->nz; 1690 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1691 Acsr = (CsrMatrix *)Acusp->mat->mat; 1692 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1693 1694 /* Factorize fact inplace */ 1695 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1696 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 
1697 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1698 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1699 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1700 */ 1701 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1702 if (PetscDefined(USE_DEBUG)) { 1703 int numerical_zero; 1704 cusparseStatus_t status; 1705 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1706 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1707 } 1708 1709 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1710 1711 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1712 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1713 */ 1714 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1715 1716 fact->offloadmask = PETSC_OFFLOAD_GPU; 1717 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1718 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1719 fact->ops->matsolve = NULL; 1720 fact->ops->matsolvetranspose = NULL; 1721 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1722 PetscFunctionReturn(PETSC_SUCCESS); 1723 } 

/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - Symbolic phase of an ICC(0) factorization carried out on the device.

  Input Parameters:
+ fact - the factor matrix; its spptr holds the Mat_SeqAIJCUSPARSETriFactors that is (re)populated here
. A    - the matrix to factor
. IS   - unnamed ordering argument; it is not used by this routine
- info - factorization options; only info->fill is read (recorded as fill_ratio_given)

  Notes:
  The routine resets any previous factor data, duplicates A's metadata into fact, copies A's
  uncompressed row pointers and column indices into device arrays owned by fs, allocates the value
  array, creates the cuSPARSE descriptors, sizes and allocates the work buffers, runs the csric02
  analysis, estimates the numeric flop count, and finally installs
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 as the numeric Cholesky routine of fact.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt m, nz;

  PetscFunctionBegin;
  /* Input validation is only performed in debug builds */
  if (PetscDefined(USE_DEBUG)) {
    PetscInt i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask = PETSC_OFFLOAD_BOTH;
  fact->factortype = MAT_FACTOR_ICC;
  fact->info.factor_mallocs = 0;
  fact->info.fill_ratio_given = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* level 0: the factor reuses A's nonzero pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  /* Ai, Aj are device pointers here, hence the device-to-device copies */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* L is described on top of the same row/column/value arrays that hold the in-place factorization */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  /* The csric02 size query is skipped for an empty matrix */
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  /* X and Y are length-m device work vectors wrapped by the dense-vector descriptors used for the SpSV queries below */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  /* The solve with Lt reuses the descriptor of L with the transpose operation */
  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
     The larger of the two SpSV buffers is the one aliased with factBuffer_M; the other gets its own allocation.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    /* This inner Ai shadows the device pointer above; it is the host row-pointer array of A */
    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* half of the off-diagonal entries are taken to lie left of the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  Symbolic ILU for a MATSEQAIJCUSPARSE factor. With CUSPARSE_VERSION >= 11500 the device ILU(0)
  routine is used when factorization on the device was requested, info->levels is zero and both
  orderings are the identity; in every other case the stale device factors are reset and the host
  SeqAIJ symbolic ILU is run, with MatLUFactorNumeric_SeqAIJCUSPARSE installed as numeric routine.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  /* The identity flags stay PETSC_FALSE (forcing the host path) unless device factorization was requested */
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
1888 PetscFunctionReturn(PETSC_SUCCESS); 1889 } 1890 1891 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1892 { 1893 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1894 1895 PetscFunctionBegin; 1896 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1897 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1898 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 1899 PetscFunctionReturn(PETSC_SUCCESS); 1900 } 1901 1902 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 1903 { 1904 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1905 1906 PetscFunctionBegin; 1907 #if CUSPARSE_VERSION >= 11500 1908 PetscBool perm_identity = PETSC_FALSE; 1909 if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity)); 1910 if (!info->levels && perm_identity) { 1911 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 1912 } else 1913 #endif 1914 { 1915 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1916 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 1917 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 1918 } 1919 PetscFunctionReturn(PETSC_SUCCESS); 1920 } 1921 1922 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 1923 { 1924 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1925 1926 PetscFunctionBegin; 1927 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1928 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 1929 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 1930 PetscFunctionReturn(PETSC_SUCCESS); 1931 } 1932 1933 PetscErrorCode 
MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  /* Reports MATSOLVERCUSPARSE as the solver type; the Mat argument is unused */
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) factor matrix for the "cusparse" solver.

  Input Parameters:
+ A     - the matrix to be factored
- ftype - the factorization type (LU, ILU, ILUDT, Cholesky or ICC; anything else raises PETSC_ERR_SUP)

  Output Parameter:
. B - the new n x n MATSEQAIJCUSPARSE factor matrix, where n = A->rmap->n

  Options Database Key:
. -mat_factor_bind_factorization <host,device> - where to do the factorization when possible (default "device",
                                                 compared case-insensitively)

  Notes:
  The symbolic routines installed on B are the CUSPARSE ones unless A is bound to the CPU, in which
  case the plain SeqAIJ routines are used.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char *prefix;
  char factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* The factor's own prefix, when set, takes precedence over A's options prefix */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  /* For a factor matrix, spptr holds a Mat_SeqAIJCUSPARSETriFactors */
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Nested dissection for full LU, natural ordering for the incomplete variants */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSECopyFromGPU - Brings the numerical values of A back to the host array a->a when
  the only up-to-date copy lives on the device (offloadmask == PETSC_OFFLOAD_GPU).

  Notes:
  Unfactored matrices are copied from the device CSR value array. Factored matrices can be copied
  only when a device factorization stored its values in fs->csrVal; that requires the same
  CUSPARSE_VERSION >= 11500 that enables the device ILU(0)/ICC(0) paths, so the same guard is used
  here. Any other factored matrix raises PETSC_ERR_SUP. On success the offload mask becomes
  PETSC_OFFLOAD_BOTH. Only values are transferred, never the index arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* valid view of spptr for unfactored matrices only */
#if CUSPARSE_VERSION >= 11500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; /* valid view of spptr for factored matrices only */
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 11500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-write access to the host values: sync from the device first, then hand out a->a */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of read-write access: the host copy is now the only valid one */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-only access to the host values: sync from the device first, then hand out a->a */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of read-only access: nothing was modified, so the offload mask is left untouched */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write-only access to the host values: the old contents are irrelevant, so no device-to-host copy is made */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of write-only access: the host copy is now the only valid one */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix *matrix;

  PetscFunctionBegin;
  /* Reject factored matrices before touching the device data: MatSeqAIJCUSPARSECopyToGPU() reads
     A->spptr as a Mat_SeqAIJCUSPARSE, whereas a factored matrix stores a Mat_SeqAIJCUSPARSETriFactors there */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  /* Each output is optional. The device index arrays are 32-bit, so they can only be handed out
     as PetscInt when PETSc is not configured with 64-bit indices */
  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device copy of A current.

  Notes:
  Nothing is done unless the offload mask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
  If the nonzero state is unchanged and the storage format is CSR, only the values are re-sent.
  Otherwise the device structure is destroyed and rebuilt from the host CSR data (using the
  compressed-row representation when a->compressedrow.use is set).
  When the host has no value array (a->a is NULL) only the sparsity pattern is uploaded and the
  offload mask is left unchanged; in all other cases it becomes PETSC_OFFLOAD_BOTH.
  Errors with PETSC_ERR_GPU if A is bound to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  PetscInt m = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t stat;
  PetscBool both = PETSC_TRUE; /* cleared when only the pattern (no values) could be uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Values changed but the pattern did not: invalidate the cached transpose without destroying it */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Full rebuild: discard every device-side object derived from the old pattern */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Select the row description to upload: compressed (nonempty rows only) or the full one */
        if (a->compressedrow.use) {
          m = a->compressedrow.nrows;
          ii = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m = A->rmap->n;
          ii = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* No host values: take the entry count from the row offsets and upload the pattern only */
          nnz = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* The scalars 1, 0, 1 are kept in device memory because the handle is switched to device pointer mode below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build the device matrix in the storage format chosen for this Mat: CSR, or hybrid/ellpack (CUDA < 11 only) */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Stage the data as a temporary device CSR matrix, convert it to HYB/ELL, then free the staging copy */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices = NULL;
          tmp = 0;
        }
        /* Logged traffic: row offsets and column indices (int), compressed-row indices (PetscInt), values plus the three scalar constants */
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Remember which nonzero pattern the device structure was built for */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Functor on a 2-tuple: element 1 += element 0 */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Functor on a 2-tuple: element 1 = element 0 */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Functor on a 2-tuple: element 0 = element 1 (the reverse direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product data attached to a Mat_Product for CUSPARSE matrix-matrix products; released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool cisdense; /* when set, the sparse-dense numeric phase converts C back to MATSEQDENSE at the end */
  PetscScalar *Bt; /* device buffer (released with cudaFree); the pre-CUDA-11 sparse-dense path stores the explicit transpose of B here */
  Mat X; /* intermediate dense matrix; the sparse-dense numeric phase writes its SpMM result here for RARt and PtAP */
  PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix *Bcsr; /* owned: deleted in the destructor */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr; /* dense descriptor of B used by SpMM */
  cusparseDnMatDescr_t matCDescr; /* dense descriptor of C (or of X) used by SpMM */
  PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* device buffer, released with cudaFree */
  void *dBuffer5; /* device buffer, released with cudaFree */
#endif
  size_t mmBufferSize; /* current size in bytes of mmBuffer */
  void *mmBuffer; /* device work buffer handed to cusparseSpMM */
  void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for a MatMatCusparse: releases the device buffers, cuSPARSE descriptors and the
   intermediate matrix it owns, then the struct itself. Null descriptors and buffers are skipped. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2330 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2331 #endif 2332 PetscCall(MatDestroy(&mmdata->X)); 2333 PetscCall(PetscFree(data)); 2334 PetscFunctionReturn(PETSC_SUCCESS); 2335 } 2336 2337 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool); 2338 2339 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2340 { 2341 Mat_Product *product = C->product; 2342 Mat A, B; 2343 PetscInt m, n, blda, clda; 2344 PetscBool flg, biscuda; 2345 Mat_SeqAIJCUSPARSE *cusp; 2346 cusparseStatus_t stat; 2347 cusparseOperation_t opA; 2348 const PetscScalar *barray; 2349 PetscScalar *carray; 2350 MatMatCusparse *mmdata; 2351 Mat_SeqAIJCUSPARSEMultStruct *mat; 2352 CsrMatrix *csrmat; 2353 2354 PetscFunctionBegin; 2355 MatCheckProduct(C, 1); 2356 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2357 mmdata = (MatMatCusparse *)product->data; 2358 A = product->A; 2359 B = product->B; 2360 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2361 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2362 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2363 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2364 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2365 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2366 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2367 switch (product->type) { 2368 case MATPRODUCT_AB: 2369 case MATPRODUCT_PtAP: 2370 mat = cusp->mat; 2371 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2372 m = A->rmap->n; 2373 n = B->cmap->n; 2374 break; 2375 case MATPRODUCT_AtB: 2376 if (!A->form_explicit_transpose) { 2377 mat = cusp->mat; 2378 opA = CUSPARSE_OPERATION_TRANSPOSE; 2379 
} else { 2380 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2381 mat = cusp->matTranspose; 2382 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2383 } 2384 m = A->cmap->n; 2385 n = B->cmap->n; 2386 break; 2387 case MATPRODUCT_ABt: 2388 case MATPRODUCT_RARt: 2389 mat = cusp->mat; 2390 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2391 m = A->rmap->n; 2392 n = B->rmap->n; 2393 break; 2394 default: 2395 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2396 } 2397 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2398 csrmat = (CsrMatrix *)mat->mat; 2399 /* if the user passed a CPU matrix, copy the data to the GPU */ 2400 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 2401 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2402 PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr)); 2403 2404 PetscCall(MatDenseGetLDA(B, &blda)); 2405 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2406 PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr)); 2407 PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2408 } else { 2409 PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr)); 2410 PetscCall(MatDenseGetLDA(C, &clda)); 2411 } 2412 2413 PetscCall(PetscLogGpuTimeBegin()); 2414 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2415 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2416 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2417 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2418 size_t mmBufferSize; 2419 if (mmdata->initialized && mmdata->Blda != blda) { 2420 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2421 mmdata->matBDescr = NULL; 2422 } 2423 if (!mmdata->matBDescr) { 2424 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2425 mmdata->Blda = blda; 2426 } 2427 2428 if (mmdata->initialized && mmdata->Clda != clda) { 2429 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2430 mmdata->matCDescr = NULL; 2431 } 2432 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2433 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2434 mmdata->Clda = clda; 2435 } 2436 2437 if (!mat->matDescr) { 2438 stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2439 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2440 PetscCallCUSPARSE(stat); 2441 } 2442 stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize); 2443 PetscCallCUSPARSE(stat); 2444 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2445 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2446 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2447 mmdata->mmBufferSize = mmBufferSize; 2448 } 2449 mmdata->initialized = PETSC_TRUE; 2450 } else { 2451 /* to be safe, 
always update pointers of the mats */ 2452 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get())); 2453 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 2454 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2455 } 2456 2457 /* do cusparseSpMM, which supports transpose on B */ 2458 stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer); 2459 PetscCallCUSPARSE(stat); 2460 #else 2461 PetscInt k; 2462 /* cusparseXcsrmm does not support transpose on B */ 2463 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2464 cublasHandle_t cublasv2handle; 2465 cublasStatus_t cerr; 2466 2467 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2468 cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 2469 PetscCallCUBLAS(cerr); 2470 blda = B->cmap->n; 2471 k = B->cmap->n; 2472 } else { 2473 k = B->rmap->n; 2474 } 2475 2476 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2477 stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? 
mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for products of a
  MATSEQAIJCUSPARSE matrix A (CSR storage format only) with a dense matrix B.

  Input/Output Parameter:
. C - the product matrix; C->product must be set and must not yet carry product data

  What it does:
  - sizes C according to the product type (AB, AtB, ABt, PtAP, RARt); any other type is an error
  - remembers whether C was MATSEQDENSE (mmdata->cisdense) and switches C to MATSEQDENSECUDA
  - with CUDA < 11, allocates the device buffer mmdata->Bt for ABt and RARt
  - for RARt and PtAP, creates (without preallocating) the intermediate dense matrix mmdata->X
  - installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric routine

  The product data and its destroy callback are attached to C->product immediately after
  allocation, so that an error in any of the later calls does not leak them.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* local sizes (m x n) of the result for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data: attach it right away so that it is released through the product's destroy callback if anything below fails */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  mmdata->cisdense    = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse product
  C = op(A) * op(B) for MATSEQAIJCUSPARSE matrices in CSR storage format.

  Input/Output Parameter:
. C - the product matrix; C->product->data must have been set by the symbolic phase

  Notes:
  - If mmdata->reusesym is set, the values were already computed by the symbolic phase; the flag is
    cleared and only the assembly bookkeeping at the end is performed.
  - If C has no nonzeros, only the assembly bookkeeping is performed.
  - Transposed operands are taken from the matTranspose structures of A or B; the cuSPARSE calls
    always use CUSPARSE_OPERATION_NON_TRANSPOSE.
  - AtB with A flagged symmetric, and ABt with B flagged symmetric, are computed as AB; this requires
    that the symbolic phase recorded the same decision.
  - On completion of the GPU computation, C->offloadmask is PETSC_OFFLOAD_GPU.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* nothing to compute for an empty result */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* a symmetric operand lets us replace a transposed product by a plain AB, provided the symbolic phase did the same */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* reuse the sparsity pattern and the buffers prepared by the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse-sparse product
  C = op(A) * op(B) (AB, AtB or ABt) for MATSEQAIJCUSPARSE matrices in CSR storage format.

  Input/Output Parameter:
. C - the product matrix; C->product must be set and must not yet carry product data

  What it does:
  - allocates the MatMatCusparse product data and attaches it (with its destroy callback) to C->product
  - copies A and B to the GPU and, for AtB/ABt, forms the explicit transpose of A/B
  - makes C a MATSEQAIJCUSPARSE matrix, in compressed row format whenever A (not transposed) is
  - lets cuSPARSE determine the sparsity pattern of C; with the interfaces used here the values are
    computed too (see the comments in the body)
  - copies the row offsets and column indices of C back to the host and fills the Mat_SeqAIJ
    bookkeeping (i, j, ilen, imax, rmax, nonzerorowcnt, ...)
  - installs MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE as the numeric routine

  Notes:
  - AtB with A flagged symmetric, and ABt with B flagged symmetric, are handled as AB; the decision is
    recorded in the product so that the numeric phase can verify it.
  - If the result has zero rows or columns, or A or B has no nonzeros, no cuSPARSE product routine is
    called and C gets an empty pattern.
  - When called through the user API with A and B up to date on both host and device, the values computed
    here are flagged as reusable (mmdata->reusesym) so the first numeric call only finalizes the assembly.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* a symmetric operand lets us replace a transposed product by a plain AB; record it for the numeric phase */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices; /* shared with the compressed structure, not copied */
    Bcsr->values         = cBcsr->values;         /* shared with the compressed structure, not copied */
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* every nonzero A(i,l) is combined with all nonzeros of row l of B */
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* A^T * B is the sum over rows i of the outer products of row i of A with row i of B */
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else if (ptype == MATPRODUCT_ABt) {
    /* every nonzero A(i,l) is combined with all nonzeros of column l of B (i.e. of row l of B^T) */
    PetscInt *bcolnz;

    PetscCall(PetscCalloc1(PetscMax(A->cmap->n, B->cmap->n), &bcolnz));
    for (j = 0; j < b->i[B->rmap->n]; j++) bcolnz[b->j[j]]++;
    for (j = 0, flops = 0; j < a->i[A->rmap->n]; j++) flops += 2. * bcolnz[a->j[j]];
    PetscCall(PetscFree(bcolnz));
  } else {
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    /* first call queries the sizes of buffers 2-4, second call computes the number of nonzeros of C */
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    /* first call queries the size of buffer 5, second call copies the pattern into C */
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    /* the buffer sizes are size_t: convert explicitly to match the %ld conversions */
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, (long)(bufferSize4 / 1024), (long)(bufferSize5 / 1024)));
  }
  #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  /* the buffer sizes are size_t: convert explicitly to match the %ld conversions */
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, (long)(bufSize2 / 1024),
                      (long)(mmdata->mmBufferSize / 1024)));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* cusparseXcsrgemmNnz returns the number of nonzeros through the host variable cnz */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz                = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* build the host-side Mat_SeqAIJ structure (row offsets, column indices, per-row counts) from the device CSR pattern */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m + 1, &c->i));
  PetscCall(PetscMalloc1(c->nz, &c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt) !!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case
MATPRODUCT_AtB: 3090 if (product->api_user) { 3091 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3092 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3093 PetscOptionsEnd(); 3094 } else { 3095 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3096 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3097 PetscOptionsEnd(); 3098 } 3099 break; 3100 case MATPRODUCT_PtAP: 3101 if (product->api_user) { 3102 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3103 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3104 PetscOptionsEnd(); 3105 } else { 3106 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3107 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3108 PetscOptionsEnd(); 3109 } 3110 break; 3111 case MATPRODUCT_RARt: 3112 if (product->api_user) { 3113 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3114 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3115 PetscOptionsEnd(); 3116 } else { 3117 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3118 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3119 PetscOptionsEnd(); 3120 } 3121 break; 3122 case MATPRODUCT_ABC: 3123 if (product->api_user) { 3124 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat"); 3125 
PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3126 PetscOptionsEnd(); 3127 } else { 3128 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3129 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3130 PetscOptionsEnd(); 3131 } 3132 break; 3133 default: 3134 break; 3135 } 3136 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3137 } 3138 /* dispatch */ 3139 if (isdense) { 3140 switch (product->type) { 3141 case MATPRODUCT_AB: 3142 case MATPRODUCT_AtB: 3143 case MATPRODUCT_ABt: 3144 case MATPRODUCT_PtAP: 3145 case MATPRODUCT_RARt: 3146 if (product->A->boundtocpu) { 3147 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3148 } else { 3149 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3150 } 3151 break; 3152 case MATPRODUCT_ABC: 3153 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3154 break; 3155 default: 3156 break; 3157 } 3158 } else if (Biscusp && Ciscusp) { 3159 switch (product->type) { 3160 case MATPRODUCT_AB: 3161 case MATPRODUCT_AtB: 3162 case MATPRODUCT_ABt: 3163 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3164 break; 3165 case MATPRODUCT_PtAP: 3166 case MATPRODUCT_RARt: 3167 case MATPRODUCT_ABC: 3168 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3169 break; 3170 default: 3171 break; 3172 } 3173 } else { /* fallback for AIJ */ 3174 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3175 } 3176 PetscFunctionReturn(PETSC_SUCCESS); 3177 } 3178 3179 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3180 { 3181 PetscFunctionBegin; 3182 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3183 PetscFunctionReturn(PETSC_SUCCESS); 3184 } 3185 3186 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3187 { 3188 
PetscFunctionBegin; 3189 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3190 PetscFunctionReturn(PETSC_SUCCESS); 3191 } 3192 3193 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3194 { 3195 PetscFunctionBegin; 3196 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3197 PetscFunctionReturn(PETSC_SUCCESS); 3198 } 3199 3200 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3201 { 3202 PetscFunctionBegin; 3203 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3204 PetscFunctionReturn(PETSC_SUCCESS); 3205 } 3206 3207 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3208 { 3209 PetscFunctionBegin; 3210 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3211 PetscFunctionReturn(PETSC_SUCCESS); 3212 } 3213 3214 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3215 { 3216 int i = blockIdx.x * blockDim.x + threadIdx.x; 3217 if (i < n) y[idx[i]] += x[i]; 3218 } 3219 3220 /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3221 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3222 { 3223 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3224 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3225 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3226 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3227 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3228 PetscBool compressed; 3229 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3230 PetscInt nx, ny; 3231 #endif 3232 3233 PetscFunctionBegin; 3234 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3235 if (!a->nz) { 3236 if (yy) PetscCall(VecSeq_CUDA::copy(yy, zz)); 3237 else PetscCall(VecSeq_CUDA::set(zz, 0)); 3238 PetscFunctionReturn(PETSC_SUCCESS); 3239 } 3240 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3241 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3242 if (!trans) { 3243 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3244 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3245 } else { 3246 if (herm || !A->form_explicit_transpose) { 3247 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3248 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3249 } else { 3250 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3251 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3252 } 3253 } 3254 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3255 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3256 3257 try { 3258 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3259 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3260 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3261 3262 PetscCall(PetscLogGpuTimeBegin()); 3263 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3264 /* z = A x + beta y. 3265 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3266 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3267 */ 3268 xptr = xarray; 3269 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3270 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3271 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3272 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3273 allocated to accommodate different uses. So we get the length info directly from mat. 3274 */ 3275 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3276 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3277 nx = mat->num_cols; 3278 ny = mat->num_rows; 3279 } 3280 #endif 3281 } else { 3282 /* z = A^T x + beta y 3283 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3284 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3285 */ 3286 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3287 dptr = zarray; 3288 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3289 if (compressed) { /* Scatter x to work vector */ 3290 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3291 3292 thrust::for_each( 3293 #if PetscDefined(HAVE_THRUST_ASYNC) 3294 thrust::cuda::par.on(PetscDefaultCudaStream), 3295 #endif 3296 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3297 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3298 } 3299 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3300 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3301 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3302 nx = mat->num_rows; 3303 ny = mat->num_cols; 3304 } 3305 #endif 3306 } 3307 3308 /* csr_spmv does y = alpha op(A) x + beta y */ 3309 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3310 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3311 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3312 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3313 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3314 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3315 PetscCallCUSPARSE( 3316 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3317 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3318 3319 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 
3320 } else { 3321 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3322 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3323 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3324 } 3325 3326 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3327 matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3328 #else 3329 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3330 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3331 #endif 3332 } else { 3333 if (cusparsestruct->nrows) { 3334 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3335 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3336 #else 3337 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3338 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3339 #endif 3340 } 3341 } 3342 PetscCall(PetscLogGpuTimeEnd()); 3343 3344 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3345 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3346 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3347 PetscCall(VecSeq_CUDA::copy(yy, zz)); /* zz = yy */ 3348 } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 3349 PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */ 3350 } 3351 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3352 PetscCall(VecSeq_CUDA::set(zz, 0)); 3353 } 3354 3355 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3356 if (compressed) { 3357 PetscCall(PetscLogGpuTimeBegin()); 3358 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3359 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3360 prevent that. So I just add a ScatterAdd kernel. 3361 */ 3362 #if 0 3363 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3364 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3365 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3366 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3367 VecCUDAPlusEquals()); 3368 #else 3369 PetscInt n = matstruct->cprowIndices->size(); 3370 ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3371 #endif 3372 PetscCall(PetscLogGpuTimeEnd()); 3373 } 3374 } else { 3375 if (yy && yy != zz) PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */ 3376 } 3377 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3378 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3379 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3380 } catch (char *ex) { 3381 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, 
"CUSPARSE error: %s", ex); 3382 } 3383 if (yy) { 3384 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3385 } else { 3386 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 3387 } 3388 PetscFunctionReturn(PETSC_SUCCESS); 3389 } 3390 3391 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3392 { 3393 PetscFunctionBegin; 3394 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE)); 3395 PetscFunctionReturn(PETSC_SUCCESS); 3396 } 3397 3398 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) 3399 { 3400 PetscObjectState onnz = A->nonzerostate; 3401 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 3402 3403 PetscFunctionBegin; 3404 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 3405 if (onnz != A->nonzerostate && cusp->deviceMat) { 3406 PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n")); 3407 PetscCallCUDA(cudaFree(cusp->deviceMat)); 3408 cusp->deviceMat = NULL; 3409 } 3410 PetscFunctionReturn(PETSC_SUCCESS); 3411 } 3412 3413 /*@ 3414 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 3415 (the default parallel PETSc format). This matrix will ultimately pushed down 3416 to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix 3417 assembly performance the user should preallocate the matrix storage by setting 3418 the parameter nz (or the array nnz). By setting these parameters accurately, 3419 performance during matrix assembly can be increased by more than a factor of 50. 3420 3421 Collective 3422 3423 Input Parameters: 3424 + comm - MPI communicator, set to `PETSC_COMM_SELF` 3425 . m - number of rows 3426 . n - number of columns 3427 . nz - number of nonzeros per row (same for all rows) 3428 - nnz - array containing the number of nonzeros in the various rows 3429 (possibly different for each row) or `NULL` 3430 3431 Output Parameter: 3432 . 
 A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  If `nnz` is given then `nz` is ignored

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.  See the users' manual for details.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.  For large problems you MUST preallocate memory or you
  will get TERRIBLE performance, see the users' manual chapter on matrices.

  By default, this format uses inodes (identical nodes) when possible, to
  improve numerical efficiency of matrix-vector products and solves. We
  search for consecutive rows with the same nonzero structure, thereby
  reusing matrix information to achieve increased efficiency.
.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  {
    Mat mat = *A; /* the matrix just created; sized, typed and preallocated below */

    PetscCall(MatSetSizes(mat, m, n, m, n));
    PetscCall(MatSetType(mat, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, nz, (PetscInt *)nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Frees the cuSPARSE-side data (matrix or triangular-factor flavour, chosen by A->factortype),
   removes the functions composed on A, then destroys the SeqAIJ part */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed functions to remove, in the order they are cleared */
  static const char *const composed[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C", "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  }
  for (size_t k = 0; k < sizeof(composed) / sizeof(composed[0]); k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[k], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/* Duplicates A as a SeqAIJ matrix, then converts the copy in place to MATSEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y = a*X + Y.
   Uses cuSPARSE spgeam when the pattern of X is a subset of that of Y, a cuBLAS axpy on the value
   arrays when both patterns are the same, and MatAXPY_SeqAIJ() otherwise (also whenever the two
   matrices do not share the same axpy implementation). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x  = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  Mat_SeqAIJCUSPARSE *cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool samepattern = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());

    if (samepattern) samepattern = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (samepattern) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  switch (str) {
  case SUBSET_NONZERO_PATTERN: {
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* the scalars a and b live on the host for the spgeam calls */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } break;
  case SAME_NONZERO_PATTERN: {
    cublasHandle_t blashandle;
    PetscBLASInt   inc = 1, len = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&blashandle));
    PetscCall(PetscBLASIntCast(x->nz, &len));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(blashandle, len, &a, ax, inc, ay, inc));
    PetscCall(PetscLogGpuFlops(2.0 * len));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } break;
  default:
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y = a*Y, done as a cuBLAS scal over the y->nz stored values */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *vals;
  cublasHandle_t blashandle;
  PetscBLASInt   inc = 1, len = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscCUBLASGetHandle(&blashandle));
  PetscCall(PetscBLASIntCast(y->nz, &len));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(blashandle, len, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(len));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Zeroes the stored values on the host and, where they exist, the device copies (matrix and transpose).
   The offload mask becomes BOTH only if the device values of the matrix itself were zeroed, else CPU. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *a         = (Mat_SeqAIJ *)A->data;
  PetscBool   gpuzeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        gpuzeroed = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = gpuzeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3657 PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 3658 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3659 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3660 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3661 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3662 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3663 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3664 } else { 3665 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3666 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3667 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3668 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3669 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3670 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3671 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3672 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3673 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3674 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3675 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3676 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3677 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3678 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3679 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3680 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3681 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 3682 3683 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", 
MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - turns a SeqAIJ matrix into a MATSEQAIJCUSPARSE one.

  Input Parameters:
+ A      - the source matrix
. (type) - unused
- reuse  - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the
           existing *newmat, any other value converts *newmat as it is

  Output Parameter:
. newmat - the converted matrix

  The default vector type becomes VECCUDA, a cuSPARSE handle bound to PetscDefaultCudaStream is
  created when the matrix has no GPU data structure yet, and the CUSPARSE operation table is installed.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
    break;
  default: /* nothing to copy, *newmat is converted as is */
    break;
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      /* factored matrix: only the triangular-factor container and its handle are needed */
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle, PetscDefaultCudaStream));
      B->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *gpu;

      PetscCall(PetscNew(&gpu));
      PetscCallCUSPARSE(cusparseCreate(&gpu->handle));
      PetscCallCUSPARSE(cusparseSetStream(gpu->handle, PetscDefaultCudaStream));
      gpu->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      gpu->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      gpu->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      gpu->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      gpu->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = gpu;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the GPU implementations of the remaining operations */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Constructor for MATSEQAIJCUSPARSE: builds a plain SeqAIJ matrix and converts it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
                                      Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

  Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);

/* Registers the cuSPARSE band LU solver for MATSEQAIJ and the cuSPARSE LU, Cholesky, ILU and ICC solvers for MATSEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType cusparse_factors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  for (const MatFactorType ftype : cusparse_factors) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftype, MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatResetPreallocationCOO_SeqAIJCUSPARSE - releases the device data kept for COO assembly.

  Deletes the permutation arrays of the basic COO path and, when the extended path was in use,
  frees the device copies of jmap/perm. Every released pointer is reset to NULL so that a later
  reset or MatSeqAIJCUSPARSE_Destroy() does not free it a second time.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    cusp->jmap_d = NULL; /* MatSeqAIJCUSPARSE_Destroy() frees any non-NULL jmap_d */
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->perm_d = NULL; /* same for perm_d */
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Destroys the whole GPU container of a MATSEQAIJCUSPARSE matrix; *cusparsestruct is NULL on return (PetscFree() resets it) */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Deletes the three arrays of a CsrMatrix and the object itself, then resets *mat */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Overload for one triangular factor: releases its descriptors, CSR storage and work buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Overload for a mult structure: releases the stored matrix (CSR, or HYB/ELL before CUDA-11), its descriptors and scalars */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases everything held by the triangular-factor container
  except the container itself and its cuSPARSE handle.

  Every released pointer and descriptor is reset to NULL so that calling this routine again
  (for instance from MatSeqAIJCUSPARSETriFactors_Destroy() after an earlier reset) does not
  free or destroy the same object twice.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    fs->a_band_d = NULL;
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->i_band_d      = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    fs->factBuffer_M = NULL; /* only an alias, but it must not outlive the buffer it points into */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Resets the triangular-factor container, destroys its cuSPARSE handle (if one was created) and frees the container */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if ((*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle)); /* guarded like in MatSeqAIJCUSPARSE_Destroy() */
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Lexicographic "less than" on (row, col) pairs; thrust::get<>() is used since the tuple member get<>() is not available in all Thrust versions */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/* Equality of (row, col) pairs */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    if (thrust::get<0>(t1) != thrust::get<0>(t2) || thrust::get<1>(t1) != thrust::get<1>(t2)) return false;
    return true;
  }
};

struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ?
0 : 1; }
};

/* Logical "or" of two flags, returned as 0/1 */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
};

#include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic - sets the values of a matrix preallocated with
  MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

  Input Parameters:
+ A     - the matrix
. v     - the COO values, on host or device; NULL means "no values" (the matrix is zeroed with INSERT_VALUES and left alone otherwise)
- imode - INSERT_VALUES or ADD_VALUES

  The values are written directly into the device CSR array through the permutation computed at
  preallocation time; repeated COO entries are summed first. Scratch arrays are scoped objects,
  so they are released on every exit path, including early error returns.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO permutation stored: only run a regular assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
  } else {
    const PetscInt                        n = cusp->cooPerm->size();
    THRUSTARRAY                           v_staged; /* device copy of v[], only filled when v[] lives on the host */
    thrust::device_ptr<const PetscScalar> d_v;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      v_staged.assign(v, v + n);
      d_v = v_staged.data();
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
      if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to sum them first */
        THRUSTARRAY summed(matrix->values->size());
        auto        vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
        /*
          thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
          cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
          cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
        */
        thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), summed.begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
        thrust::transform(summed.begin(), summed.end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      } else {
        /* all nonzeros in d_v[] are unique entries */
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
        thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
      }
    } else {
      if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
        auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      } else {
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
        thrust::for_each(zibit, zieit, VecCUDAEquals());
      }
    }
    PetscCall(PetscLogGpuTimeEnd());
  }
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs         = 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached transpose as out of date.

  With destroy == PETSC_TRUE the transpose structure and the csr2csc index array are also released.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* stored permutation has the wrong length: drop it */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* host index arrays are staged in temporary device buffers; device arrays are used (and sorted) in place */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      thrust::adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    /* thrust::get<0>() rather than the tuple member get<0>(), which is not available in all Thrust versions */
    thrust::upper_bound(d_i, thrust::get<0>(nekey.get_iterator_tuple()), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,         /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                     /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR structure from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation for MATSEQAIJCUSPARSE.

  Input Parameters:
+ mat   - the matrix
. coo_n - number of COO entries
. coo_i - row indices, on host or device
- coo_j - column indices

  Any previous COO data is released first. If coo_i[] is on the host and a negative row or column
  index is found, the host SeqAIJ preallocation is used and its jmap/perm arrays are copied to the
  device (extended path); otherwise the device-only basic path is taken.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatAddCOOValues - kernel that folds COO values into the CSR value array.

  For every nonzero i in [0, nnz) it sums kv[perm[k]] for k in [jmap[i], jmap[i+1]) and either
  stores the sum (INSERT_VALUES) or adds it to a[i]. Written as a grid-stride loop over a 1D
  launch, so any grid/block size is valid. Index and stride are widened to PetscCount before the
  multiplication so they cannot wrap in 32-bit arithmetic for very large nnz.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets matrix values from a COO value array.

  Input Parameters:
+ A     - the matrix, previously set up with MatSetPreallocationCOO_SeqAIJCUSPARSE()
. v     - the COO values, on host or device
- imode - INSERT_VALUES or ADD_VALUES

  With the extended COO data the values are accumulated on the device by MatAddCOOValues
  (a host v[] is first copied to a temporary device buffer); otherwise the basic routine is used.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* v[] was given on the host: stage it in a temporary device buffer */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* write-only access skips the host-to-device copy of values that are about to be overwritten */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, pass `NULL` if not needed
- j - the CSR column indices, pass `NULL` if not needed

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS); /* nothing requested; a single NULL output is allowed */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* built once from the host row pointers and kept in the GPU container */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i - the CSR row pointers
- j - the CSR column indices

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the caller may modify the device values */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](chapter_matrices), `Mat`,
`MatSeqAIJCUSPARSEGetArrayWrite()` 4554 @*/ 4555 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) 4556 { 4557 PetscFunctionBegin; 4558 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4559 PetscValidPointer(a, 2); 4560 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4561 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4562 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4563 *a = NULL; 4564 PetscFunctionReturn(PETSC_SUCCESS); 4565 } 4566 4567 struct IJCompare4 { 4568 __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4569 { 4570 if (t1.get<0>() < t2.get<0>()) return true; 4571 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4572 return false; 4573 } 4574 }; 4575 4576 struct Shift { 4577 int _shift; 4578 4579 Shift(int shift) : _shift(shift) { } 4580 __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4581 }; 4582 4583 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in matlab notation */ 4584 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4585 { 4586 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4587 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4588 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4589 CsrMatrix *Acsr, *Bcsr, *Ccsr; 4590 PetscInt Annz, Bnnz; 4591 cusparseStatus_t stat; 4592 PetscInt i, m, n, zero = 0; 4593 4594 PetscFunctionBegin; 4595 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4596 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4597 PetscValidPointer(C, 4); 4598 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4599 PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 4600 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 4601 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4602 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4603 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4604 if (reuse == MAT_INITIAL_MATRIX) { 4605 m = A->rmap->n; 4606 n = A->cmap->n + B->cmap->n; 4607 PetscCall(MatCreate(PETSC_COMM_SELF, C)); 4608 PetscCall(MatSetSizes(*C, m, n, m, n)); 4609 PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4610 c = (Mat_SeqAIJ *)(*C)->data; 4611 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4612 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4613 Ccsr = new CsrMatrix; 4614 Cmat->cprowIndices = NULL; 4615 c->compressedrow.use = PETSC_FALSE; 4616 c->compressedrow.nrows = 0; 4617 c->compressedrow.i = NULL; 4618 c->compressedrow.rindex = NULL; 4619 Ccusp->workVector = NULL; 4620 Ccusp->nrows = m; 4621 Ccusp->mat = Cmat; 4622 Ccusp->mat->mat = Ccsr; 4623 Ccsr->num_rows = 
m; 4624 Ccsr->num_cols = n; 4625 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4626 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4627 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4628 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 4629 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 4630 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 4631 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4632 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4633 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4634 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4635 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4636 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4637 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4638 4639 Acsr = (CsrMatrix *)Acusp->mat->mat; 4640 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4641 Annz = (PetscInt)Acsr->column_indices->size(); 4642 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4643 c->nz = Annz + Bnnz; 4644 Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4645 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4646 Ccsr->values = new THRUSTARRAY(c->nz); 4647 Ccsr->num_entries = c->nz; 4648 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4649 if (c->nz) { 4650 auto Acoo = new THRUSTINTARRAY32(Annz); 4651 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4652 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4653 THRUSTINTARRAY32 *Aroff, *Broff; 4654 4655 if (a->compressedrow.use) { /* need full row offset */ 4656 if (!Acusp->rowoffsets_gpu) { 4657 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4658 
Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4659 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4660 } 4661 Aroff = Acusp->rowoffsets_gpu; 4662 } else Aroff = Acsr->row_offsets; 4663 if (b->compressedrow.use) { /* need full row offset */ 4664 if (!Bcusp->rowoffsets_gpu) { 4665 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4666 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 4667 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4668 } 4669 Broff = Bcusp->rowoffsets_gpu; 4670 } else Broff = Bcsr->row_offsets; 4671 PetscCall(PetscLogGpuTimeBegin()); 4672 stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4673 PetscCallCUSPARSE(stat); 4674 stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4675 PetscCallCUSPARSE(stat); 4676 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4677 auto Aperm = thrust::make_constant_iterator(1); 4678 auto Bperm = thrust::make_constant_iterator(0); 4679 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4680 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4681 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 4682 #else 4683 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4684 auto Bcib = Bcsr->column_indices->begin(); 4685 auto Bcie = Bcsr->column_indices->end(); 4686 thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 4687 #endif 4688 auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 4689 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 4690 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 4691 auto Bzb = 
thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 4692 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 4693 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4694 auto p1 = Ccusp->cooPerm->begin(); 4695 auto p2 = Ccusp->cooPerm->begin(); 4696 thrust::advance(p2, Annz); 4697 PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 4698 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 4699 thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 4700 #endif 4701 auto cci = thrust::make_counting_iterator(zero); 4702 auto cce = thrust::make_counting_iterator(c->nz); 4703 #if 0 //Errors on SUMMIT cuda 11.1.0 4704 PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4705 #else 4706 auto pred = thrust::identity<int>(); 4707 PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4708 PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 4709 #endif 4710 stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4711 PetscCallCUSPARSE(stat); 4712 PetscCall(PetscLogGpuTimeEnd()); 4713 delete wPerm; 4714 delete Acoo; 4715 delete Bcoo; 4716 delete Ccoo; 4717 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4718 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 4719 PetscCallCUSPARSE(stat); 4720 #endif 4721 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4722 
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 4723 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4724 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4725 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4726 CsrMatrix *CcsrT = new CsrMatrix; 4727 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4728 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4729 4730 (*C)->form_explicit_transpose = PETSC_TRUE; 4731 (*C)->transupdated = PETSC_TRUE; 4732 Ccusp->rowoffsets_gpu = NULL; 4733 CmatT->cprowIndices = NULL; 4734 CmatT->mat = CcsrT; 4735 CcsrT->num_rows = n; 4736 CcsrT->num_cols = m; 4737 CcsrT->num_entries = c->nz; 4738 4739 CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4740 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4741 CcsrT->values = new THRUSTARRAY(c->nz); 4742 4743 PetscCall(PetscLogGpuTimeBegin()); 4744 auto rT = CcsrT->row_offsets->begin(); 4745 if (AT) { 4746 rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4747 thrust::advance(rT, -1); 4748 } 4749 if (BT) { 4750 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4751 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4752 thrust::copy(titb, tite, rT); 4753 } 4754 auto cT = CcsrT->column_indices->begin(); 4755 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4756 if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4757 auto vT = CcsrT->values->begin(); 4758 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4759 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4760 PetscCall(PetscLogGpuTimeEnd()); 4761 4762 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 4763 
PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 4764 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4765 PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar))); 4766 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar))); 4767 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 4768 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4769 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4770 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4771 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4772 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 4773 PetscCallCUSPARSE(stat); 4774 #endif 4775 Ccusp->matTranspose = CmatT; 4776 } 4777 } 4778 4779 c->singlemalloc = PETSC_FALSE; 4780 c->free_a = PETSC_TRUE; 4781 c->free_ij = PETSC_TRUE; 4782 PetscCall(PetscMalloc1(m + 1, &c->i)); 4783 PetscCall(PetscMalloc1(c->nz, &c->j)); 4784 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4785 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4786 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4787 ii = *Ccsr->row_offsets; 4788 jj = *Ccsr->column_indices; 4789 PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4790 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4791 } else { 4792 PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), 
Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4793 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4794 } 4795 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 4796 PetscCall(PetscMalloc1(m, &c->ilen)); 4797 PetscCall(PetscMalloc1(m, &c->imax)); 4798 c->maxnz = c->nz; 4799 c->nonzerorowcnt = 0; 4800 c->rmax = 0; 4801 for (i = 0; i < m; i++) { 4802 const PetscInt nn = c->i[i + 1] - c->i[i]; 4803 c->ilen[i] = c->imax[i] = nn; 4804 c->nonzerorowcnt += (PetscInt) !!nn; 4805 c->rmax = PetscMax(c->rmax, nn); 4806 } 4807 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 4808 PetscCall(PetscMalloc1(c->nz, &c->a)); 4809 (*C)->nonzerostate++; 4810 PetscCall(PetscLayoutSetUp((*C)->rmap)); 4811 PetscCall(PetscLayoutSetUp((*C)->cmap)); 4812 Ccusp->nonzerostate = (*C)->nonzerostate; 4813 (*C)->preallocated = PETSC_TRUE; 4814 } else { 4815 PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4816 c = (Mat_SeqAIJ *)(*C)->data; 4817 if (c->nz) { 4818 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4819 PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm"); 4820 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4821 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 4822 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4823 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4824 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4825 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4826 Acsr = (CsrMatrix *)Acusp->mat->mat; 4827 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4828 Ccsr 
= (CsrMatrix *)Ccusp->mat->mat; 4829 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4830 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4831 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4832 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 4833 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size()); 4834 auto pmid = Ccusp->cooPerm->begin(); 4835 thrust::advance(pmid, Acsr->num_entries); 4836 PetscCall(PetscLogGpuTimeBegin()); 4837 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin()))); 4838 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4839 thrust::for_each(zibait, zieait, VecCUDAEquals()); 4840 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4841 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end()))); 4842 thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 4843 
PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 4844 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4845 PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4846 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4847 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4848 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4849 CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat; 4850 auto vT = CcsrT->values->begin(); 4851 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4852 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4853 (*C)->transupdated = PETSC_TRUE; 4854 } 4855 PetscCall(PetscLogGpuTimeEnd()); 4856 } 4857 } 4858 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4859 (*C)->assembled = PETSC_TRUE; 4860 (*C)->was_assembled = PETSC_FALSE; 4861 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4862 PetscFunctionReturn(PETSC_SUCCESS); 4863 } 4864 4865 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4866 { 4867 bool dmem; 4868 const PetscScalar *av; 4869 4870 PetscFunctionBegin; 4871 dmem = isCudaMem(v); 4872 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av)); 4873 if (n && idx) { 4874 THRUSTINTARRAY widx(n); 4875 widx.assign(idx, idx + n); 4876 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 4877 4878 THRUSTARRAY *w = NULL; 4879 thrust::device_ptr<PetscScalar> dv; 4880 if (dmem) { 4881 dv = thrust::device_pointer_cast(v); 4882 } else { 4883 w = new THRUSTARRAY(n); 4884 dv = w->data(); 4885 } 4886 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4887 4888 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv)); 4889 auto zieit 
= thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n)); 4890 thrust::for_each(zibit, zieit, VecCUDAEquals()); 4891 if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 4892 delete w; 4893 } else { 4894 PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4895 } 4896 if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 4897 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av)); 4898 PetscFunctionReturn(PETSC_SUCCESS); 4899 } 4900