1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the HIPSPARSE library, 4 Portions of this code are under: 5 Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 6 */ 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal() 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h> 15 #include <thrust/adjacent_difference.h> 16 #include <thrust/iterator/transform_iterator.h> 17 #if PETSC_CPP_VERSION >= 14 18 #define PETSC_HAVE_THRUST_ASYNC 1 19 #include <thrust/async/for_each.h> 20 #endif 21 #include <thrust/iterator/constant_iterator.h> 22 #include <thrust/iterator/discard_iterator.h> 23 #include <thrust/binary_search.h> 24 #include <thrust/remove.h> 25 #include <thrust/sort.h> 26 #include <thrust/unique.h> 27 28 const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0}; 29 const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0}; 30 const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0}; 31 //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! 
We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0}; 32 33 static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *); 34 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *); 35 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *); 36 static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 37 static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 38 static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *); 39 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec); 40 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec); 41 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec); 42 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec); 43 static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems PetscOptionsObject); 44 static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure); 45 static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar); 46 static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec); 47 static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec); 48 static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec); 49 static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec); 50 static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec); 51 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec); 52 static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 53 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 54 static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **); 55 
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat); 56 static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **); 57 static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat); 58 static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat); 59 static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat); 60 static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool); 61 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]); 62 static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool); 63 static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]); 64 static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode); 65 66 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 67 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *); 68 69 /* 70 PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream) 71 { 72 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr; 73 74 PetscFunctionBegin; 75 PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr"); 76 hipsparsestruct->stream = stream; 77 PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream)); 78 PetscFunctionReturn(PETSC_SUCCESS); 79 } 80 81 PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle) 82 { 83 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr; 84 85 PetscFunctionBegin; 86 PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr"); 87 if (hipsparsestruct->handle != handle) { 88 if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle)); 89 hipsparsestruct->handle = handle; 90 } 91 
PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE)); 92 PetscFunctionReturn(PETSC_SUCCESS); 93 } 94 95 PetscErrorCode MatHIPSPARSEClearHandle(Mat A) 96 { 97 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr; 98 PetscBool flg; 99 100 PetscFunctionBegin; 101 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg)); 102 if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS); 103 if (hipsparsestruct->handle) hipsparsestruct->handle = 0; 104 PetscFunctionReturn(PETSC_SUCCESS); 105 } 106 */ 107 108 PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format) 109 { 110 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr; 111 112 PetscFunctionBegin; 113 switch (op) { 114 case MAT_HIPSPARSE_MULT: 115 hipsparsestruct->format = format; 116 break; 117 case MAT_HIPSPARSE_ALL: 118 hipsparsestruct->format = format; 119 break; 120 default: 121 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op); 122 } 123 PetscFunctionReturn(PETSC_SUCCESS); 124 } 125 126 /*@ 127 MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular 128 operation. Only the `MatMult()` operation can use different GPU storage formats 129 130 Not Collective 131 132 Input Parameters: 133 + A - Matrix of type `MATSEQAIJHIPSPARSE` 134 . op - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`. 135 `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`. 136 - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.) 
137 138 Level: intermediate 139 140 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation` 141 @*/ 142 PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format) 143 { 144 PetscFunctionBegin; 145 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 146 PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format)); 147 PetscFunctionReturn(PETSC_SUCCESS); 148 } 149 150 PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu) 151 { 152 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr; 153 154 PetscFunctionBegin; 155 hipsparsestruct->use_cpu_solve = use_cpu; 156 PetscFunctionReturn(PETSC_SUCCESS); 157 } 158 159 /*@ 160 MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`. 161 162 Input Parameters: 163 + A - Matrix of type `MATSEQAIJHIPSPARSE` 164 - use_cpu - set flag for using the built-in CPU `MatSolve()` 165 166 Level: intermediate 167 168 Notes: 169 The hipSparse LU solver currently computes the factors with the built-in CPU method 170 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 171 This method to specifies if the solve is done on the CPU or GPU (GPU is the default). 
172 173 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation` 174 @*/ 175 PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) 176 { 177 PetscFunctionBegin; 178 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 179 PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu)); 180 PetscFunctionReturn(PETSC_SUCCESS); 181 } 182 183 static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg) 184 { 185 PetscFunctionBegin; 186 switch (op) { 187 case MAT_FORM_EXPLICIT_TRANSPOSE: 188 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 189 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE)); 190 A->form_explicit_transpose = flg; 191 break; 192 default: 193 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 194 break; 195 } 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info) 200 { 201 PetscBool row_identity, col_identity; 202 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 203 IS isrow = b->row, iscol = b->col; 204 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr; 205 206 PetscFunctionBegin; 207 PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A)); 208 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 209 B->offloadmask = PETSC_OFFLOAD_CPU; 210 /* determine which version of MatSolve needs to be used. 
*/ 211 PetscCall(ISIdentity(isrow, &row_identity)); 212 PetscCall(ISIdentity(iscol, &col_identity)); 213 if (!hipsparsestruct->use_cpu_solve) { 214 if (row_identity && col_identity) { 215 B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering; 216 B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering; 217 } else { 218 B->ops->solve = MatSolve_SeqAIJHIPSPARSE; 219 B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE; 220 } 221 } 222 B->ops->matsolve = NULL; 223 B->ops->matsolvetranspose = NULL; 224 225 /* get the triangular factors */ 226 if (!hipsparsestruct->use_cpu_solve) PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B)); 227 PetscFunctionReturn(PETSC_SUCCESS); 228 } 229 230 static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems PetscOptionsObject) 231 { 232 MatHIPSPARSEStorageFormat format; 233 PetscBool flg; 234 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr; 235 236 PetscFunctionBegin; 237 PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options"); 238 if (A->factortype == MAT_FACTOR_NONE) { 239 PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg)); 240 if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format)); 241 PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg)); 242 if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format)); 243 PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg)); 244 if (flg) 
PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve)); 245 PetscCall( 246 PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg)); 247 /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */ 248 PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 249 PetscCall( 250 PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg)); 251 PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 252 /* 253 PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg)); 254 PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 255 */ 256 } 257 PetscOptionsHeadEnd(); 258 PetscFunctionReturn(PETSC_SUCCESS); 259 } 260 261 static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A) 262 { 263 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 264 PetscInt n = A->rmap->n; 265 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; 266 
Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr; 267 const PetscInt *ai = a->i, *aj = a->j, *vi; 268 const MatScalar *aa = a->a, *v; 269 PetscInt *AiLo, *AjLo; 270 PetscInt i, nz, nzLower, offset, rowOffset; 271 272 PetscFunctionBegin; 273 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 274 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 275 try { 276 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 277 nzLower = n + ai[n] - ai[1]; 278 if (!loTriFactor) { 279 PetscScalar *AALo; 280 PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar))); 281 282 /* Allocate Space for the lower triangular matrix */ 283 PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt))); 284 PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt))); 285 286 /* Fill the lower triangular matrix */ 287 AiLo[0] = (PetscInt)0; 288 AiLo[n] = nzLower; 289 AjLo[0] = (PetscInt)0; 290 AALo[0] = (MatScalar)1.0; 291 v = aa; 292 vi = aj; 293 offset = 1; 294 rowOffset = 1; 295 for (i = 1; i < n; i++) { 296 nz = ai[i + 1] - ai[i]; 297 /* additional 1 for the term on the diagonal */ 298 AiLo[i] = rowOffset; 299 rowOffset += nz + 1; 300 301 PetscCall(PetscArraycpy(&AjLo[offset], vi, nz)); 302 PetscCall(PetscArraycpy(&AALo[offset], v, nz)); 303 offset += nz; 304 AjLo[offset] = (PetscInt)i; 305 AALo[offset] = (MatScalar)1.0; 306 offset += 1; 307 v += nz; 308 vi += nz; 309 } 310 311 /* allocate space for the triangular factor information */ 312 PetscCall(PetscNew(&loTriFactor)); 313 loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL; 314 /* Create the matrix description */ 315 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr)); 316 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO)); 317 
PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); 318 PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER)); 319 PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT)); 320 321 /* set the operation */ 322 loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE; 323 324 /* set the matrix */ 325 loTriFactor->csrMat = new CsrMatrix; 326 loTriFactor->csrMat->num_rows = n; 327 loTriFactor->csrMat->num_cols = n; 328 loTriFactor->csrMat->num_entries = nzLower; 329 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 330 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 331 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 332 333 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1); 334 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower); 335 loTriFactor->csrMat->values->assign(AALo, AALo + nzLower); 336 337 /* Create the solve analysis information */ 338 PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 339 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 340 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 341 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 342 PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 343 344 /* perform the solve analysis */ 345 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 346 loTriFactor->csrMat->row_offsets->data().get(), 
loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 347 348 PetscCallHIP(WaitForHIP()); 349 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 350 351 /* assign the pointer */ 352 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 353 loTriFactor->AA_h = AALo; 354 PetscCallHIP(hipHostFree(AiLo)); 355 PetscCallHIP(hipHostFree(AjLo)); 356 PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar))); 357 } else { /* update values only */ 358 if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar))); 359 /* Fill the lower triangular matrix */ 360 loTriFactor->AA_h[0] = 1.0; 361 v = aa; 362 vi = aj; 363 offset = 1; 364 for (i = 1; i < n; i++) { 365 nz = ai[i + 1] - ai[i]; 366 PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz)); 367 offset += nz; 368 loTriFactor->AA_h[offset] = 1.0; 369 offset += 1; 370 v += nz; 371 } 372 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower); 373 PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar))); 374 } 375 } catch (char *ex) { 376 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex); 377 } 378 } 379 PetscFunctionReturn(PETSC_SUCCESS); 380 } 381 382 static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A) 383 { 384 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 385 PetscInt n = A->rmap->n; 386 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; 387 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr; 388 const PetscInt *aj = a->j, *adiag, *vi; 389 const MatScalar *aa = a->a, *v; 390 PetscInt *AiUp, *AjUp; 391 PetscInt i, nz, nzUpper, offset; 392 393 PetscFunctionBegin; 394 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 395 
PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL)); 396 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 397 try { 398 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 399 nzUpper = adiag[0] - adiag[n]; 400 if (!upTriFactor) { 401 PetscScalar *AAUp; 402 PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 403 404 /* Allocate Space for the upper triangular matrix */ 405 PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 406 PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt))); 407 408 /* Fill the upper triangular matrix */ 409 AiUp[0] = (PetscInt)0; 410 AiUp[n] = nzUpper; 411 offset = nzUpper; 412 for (i = n - 1; i >= 0; i--) { 413 v = aa + adiag[i + 1] + 1; 414 vi = aj + adiag[i + 1] + 1; 415 nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */ 416 offset -= (nz + 1); /* decrement the offset */ 417 418 /* first, set the diagonal elements */ 419 AjUp[offset] = (PetscInt)i; 420 AAUp[offset] = (MatScalar)1. 
/ v[nz]; 421 AiUp[i] = AiUp[i + 1] - (nz + 1); 422 423 PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz)); 424 PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz)); 425 } 426 427 /* allocate space for the triangular factor information */ 428 PetscCall(PetscNew(&upTriFactor)); 429 upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL; 430 431 /* Create the matrix description */ 432 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr)); 433 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO)); 434 PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); 435 PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER)); 436 PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT)); 437 438 /* set the operation */ 439 upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE; 440 441 /* set the matrix */ 442 upTriFactor->csrMat = new CsrMatrix; 443 upTriFactor->csrMat->num_rows = n; 444 upTriFactor->csrMat->num_cols = n; 445 upTriFactor->csrMat->num_entries = nzUpper; 446 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 447 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 448 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 449 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1); 450 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper); 451 upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper); 452 453 /* Create the solve analysis information */ 454 PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 455 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 456 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 457 
upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 458 PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 459 460 /* perform the solve analysis */ 461 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 462 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 463 464 PetscCallHIP(WaitForHIP()); 465 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 466 467 /* assign the pointer */ 468 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 469 upTriFactor->AA_h = AAUp; 470 PetscCallHIP(hipHostFree(AiUp)); 471 PetscCallHIP(hipHostFree(AjUp)); 472 PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar))); 473 } else { 474 if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar))); 475 /* Fill the upper triangular matrix */ 476 offset = nzUpper; 477 for (i = n - 1; i >= 0; i--) { 478 v = aa + adiag[i + 1] + 1; 479 nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */ 480 offset -= (nz + 1); /* decrement the offset */ 481 482 /* first, set the diagonal elements */ 483 upTriFactor->AA_h[offset] = 1. 
/ v[nz]; 484 PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz)); 485 } 486 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper); 487 PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar))); 488 } 489 } catch (char *ex) { 490 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex); 491 } 492 } 493 PetscFunctionReturn(PETSC_SUCCESS); 494 } 495 496 static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A) 497 { 498 PetscBool row_identity, col_identity; 499 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 500 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; 501 IS isrow = a->row, iscol = a->icol; 502 PetscInt n = A->rmap->n; 503 504 PetscFunctionBegin; 505 PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors"); 506 PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A)); 507 PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A)); 508 509 if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n); 510 hipsparseTriFactors->nnz = a->nz; 511 512 A->offloadmask = PETSC_OFFLOAD_BOTH; 513 /* lower triangular indices */ 514 PetscCall(ISIdentity(isrow, &row_identity)); 515 if (!row_identity && !hipsparseTriFactors->rpermIndices) { 516 const PetscInt *r; 517 518 PetscCall(ISGetIndices(isrow, &r)); 519 hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 520 hipsparseTriFactors->rpermIndices->assign(r, r + n); 521 PetscCall(ISRestoreIndices(isrow, &r)); 522 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 523 } 524 /* upper triangular indices */ 525 PetscCall(ISIdentity(iscol, &col_identity)); 526 if (!col_identity && !hipsparseTriFactors->cpermIndices) { 527 const PetscInt *c; 528 529 PetscCall(ISGetIndices(iscol, &c)); 530 hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 531 hipsparseTriFactors->cpermIndices->assign(c, c + n); 532 PetscCall(ISRestoreIndices(iscol, &c)); 533 
PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 534 } 535 PetscFunctionReturn(PETSC_SUCCESS); 536 } 537 538 static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A) 539 { 540 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 541 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; 542 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr; 543 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr; 544 PetscInt *AiUp, *AjUp; 545 PetscScalar *AAUp; 546 PetscScalar *AALo; 547 PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j; 548 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 549 const PetscInt *ai = b->i, *aj = b->j, *vj; 550 const MatScalar *aa = b->a, *v; 551 552 PetscFunctionBegin; 553 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 554 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 555 try { 556 PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 557 PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar))); 558 if (!upTriFactor && !loTriFactor) { 559 /* Allocate Space for the upper triangular matrix */ 560 PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 561 PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt))); 562 563 /* Fill the upper triangular matrix */ 564 AiUp[0] = (PetscInt)0; 565 AiUp[n] = nzUpper; 566 offset = 0; 567 for (i = 0; i < n; i++) { 568 /* set the pointers */ 569 v = aa + ai[i]; 570 vj = aj + ai[i]; 571 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 572 573 /* first, set the diagonal elements */ 574 AjUp[offset] = (PetscInt)i; 575 AAUp[offset] = (MatScalar)1.0 / v[nz]; 576 AiUp[i] = offset; 577 AALo[offset] = (MatScalar)1.0 / v[nz]; 578 579 offset += 1; 580 if (nz > 0) { 581 PetscCall(PetscArraycpy(&AjUp[offset], vj, nz)); 582 
PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 583 for (j = offset; j < offset + nz; j++) { 584 AAUp[j] = -AAUp[j]; 585 AALo[j] = AAUp[j] / v[nz]; 586 } 587 offset += nz; 588 } 589 } 590 591 /* allocate space for the triangular factor information */ 592 PetscCall(PetscNew(&upTriFactor)); 593 upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL; 594 595 /* Create the matrix description */ 596 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr)); 597 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO)); 598 PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); 599 PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER)); 600 PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT)); 601 602 /* set the matrix */ 603 upTriFactor->csrMat = new CsrMatrix; 604 upTriFactor->csrMat->num_rows = A->rmap->n; 605 upTriFactor->csrMat->num_cols = A->cmap->n; 606 upTriFactor->csrMat->num_entries = a->nz; 607 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 608 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 609 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 610 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 611 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 612 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 613 614 /* set the operation */ 615 upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE; 616 617 /* Create the solve analysis information */ 618 PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 619 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 620 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 
621 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 622 PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 623 624 /* perform the solve analysis */ 625 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 626 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 627 628 PetscCallHIP(WaitForHIP()); 629 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 630 631 /* assign the pointer */ 632 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 633 634 /* allocate space for the triangular factor information */ 635 PetscCall(PetscNew(&loTriFactor)); 636 loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL; 637 638 /* Create the matrix description */ 639 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr)); 640 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO)); 641 PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); 642 PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER)); 643 PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT)); 644 645 /* set the operation */ 646 loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE; 647 648 /* set the matrix */ 649 loTriFactor->csrMat = new CsrMatrix; 650 loTriFactor->csrMat->num_rows = A->rmap->n; 651 loTriFactor->csrMat->num_cols = A->cmap->n; 652 loTriFactor->csrMat->num_entries = a->nz; 653 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 654 
loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 655 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 656 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 657 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 658 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 659 660 /* Create the solve analysis information */ 661 PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 662 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 663 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 664 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 665 PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 666 667 /* perform the solve analysis */ 668 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 669 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 670 671 PetscCallHIP(WaitForHIP()); 672 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 673 674 /* assign the pointer */ 675 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 676 677 PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 678 PetscCallHIP(hipHostFree(AiUp)); 679 PetscCallHIP(hipHostFree(AjUp)); 680 } else { 681 /* Fill the upper triangular matrix */ 682 offset = 0; 683 for (i = 0; i < n; i++) { 684 /* set the pointers */ 
685 v = aa + ai[i]; 686 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 687 688 /* first, set the diagonal elements */ 689 AAUp[offset] = 1.0 / v[nz]; 690 AALo[offset] = 1.0 / v[nz]; 691 692 offset += 1; 693 if (nz > 0) { 694 PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 695 for (j = offset; j < offset + nz; j++) { 696 AAUp[j] = -AAUp[j]; 697 AALo[j] = AAUp[j] / v[nz]; 698 } 699 offset += nz; 700 } 701 } 702 PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors"); 703 PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors"); 704 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 705 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 706 PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar))); 707 } 708 PetscCallHIP(hipHostFree(AAUp)); 709 PetscCallHIP(hipHostFree(AALo)); 710 } catch (char *ex) { 711 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex); 712 } 713 } 714 PetscFunctionReturn(PETSC_SUCCESS); 715 } 716 717 static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A) 718 { 719 PetscBool perm_identity; 720 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 721 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; 722 IS ip = a->row; 723 PetscInt n = A->rmap->n; 724 725 PetscFunctionBegin; 726 PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors"); 727 PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A)); 728 if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n); 729 hipsparseTriFactors->nnz = (a->nz - n) * 2 + n; 730 731 A->offloadmask = PETSC_OFFLOAD_BOTH; 732 /* lower triangular indices */ 733 PetscCall(ISIdentity(ip, &perm_identity)); 734 if (!perm_identity) { 735 IS iip; 736 const PetscInt *irip, *rip; 737 738 PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 739 PetscCall(ISGetIndices(iip, &irip)); 740 PetscCall(ISGetIndices(ip, 
&rip)); 741 hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 742 hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 743 hipsparseTriFactors->rpermIndices->assign(rip, rip + n); 744 hipsparseTriFactors->cpermIndices->assign(irip, irip + n); 745 PetscCall(ISRestoreIndices(iip, &irip)); 746 PetscCall(ISDestroy(&iip)); 747 PetscCall(ISRestoreIndices(ip, &rip)); 748 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 749 } 750 PetscFunctionReturn(PETSC_SUCCESS); 751 } 752 753 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info) 754 { 755 PetscBool perm_identity; 756 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 757 IS ip = b->row; 758 759 PetscFunctionBegin; 760 PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A)); 761 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 762 B->offloadmask = PETSC_OFFLOAD_CPU; 763 /* determine which version of MatSolve needs to be used. */ 764 PetscCall(ISIdentity(ip, &perm_identity)); 765 if (perm_identity) { 766 B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering; 767 B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering; 768 B->ops->matsolve = NULL; 769 B->ops->matsolvetranspose = NULL; 770 } else { 771 B->ops->solve = MatSolve_SeqAIJHIPSPARSE; 772 B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE; 773 B->ops->matsolve = NULL; 774 B->ops->matsolvetranspose = NULL; 775 } 776 777 /* get the triangular factors */ 778 PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B)); 779 PetscFunctionReturn(PETSC_SUCCESS); 780 } 781 782 static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A) 783 { 784 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; 785 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr; 786 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct 
*)hipsparseTriFactors->upTriFactorPtr; 787 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT; 788 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT; 789 hipsparseIndexBase_t indexBase; 790 hipsparseMatrixType_t matrixType; 791 hipsparseFillMode_t fillMode; 792 hipsparseDiagType_t diagType; 793 794 PetscFunctionBegin; 795 /* allocate space for the transpose of the lower triangular factor */ 796 PetscCall(PetscNew(&loTriFactorT)); 797 loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL; 798 799 /* set the matrix descriptors of the lower triangular factor */ 800 matrixType = hipsparseGetMatType(loTriFactor->descr); 801 indexBase = hipsparseGetMatIndexBase(loTriFactor->descr); 802 fillMode = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER; 803 diagType = hipsparseGetMatDiagType(loTriFactor->descr); 804 805 /* Create the matrix description */ 806 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr)); 807 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 808 PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType)); 809 PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode)); 810 PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType)); 811 812 /* set the operation */ 813 loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE; 814 815 /* allocate GPU space for the CSC of the lower triangular factor*/ 816 loTriFactorT->csrMat = new CsrMatrix; 817 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 818 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 819 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 820 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 821 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 822 loTriFactorT->csrMat->values = new 
THRUSTARRAY(loTriFactorT->csrMat->num_entries); 823 824 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 825 /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm 826 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0) 827 PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 828 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 829 loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 830 PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 831 #endif 832 */ 833 PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0)); 834 835 PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 836 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 837 #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/ 838 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 839 hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 840 #else 841 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase)); 842 #endif 843 844 PetscCallHIP(WaitForHIP()); 845 
PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0)); 846 847 /* Create the solve analysis information */ 848 PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 849 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 850 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 851 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 852 PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 853 854 /* perform the solve analysis */ 855 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 856 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 857 858 PetscCallHIP(WaitForHIP()); 859 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 860 861 /* assign the pointer */ 862 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 863 864 /*********************************************/ 865 /* Now the Transpose of the Upper Tri Factor */ 866 /*********************************************/ 867 868 /* allocate space for the transpose of the upper triangular factor */ 869 PetscCall(PetscNew(&upTriFactorT)); 870 upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL; 871 872 /* set the matrix descriptors of the upper triangular factor */ 873 matrixType = hipsparseGetMatType(upTriFactor->descr); 874 indexBase = hipsparseGetMatIndexBase(upTriFactor->descr); 875 
fillMode = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER; 876 diagType = hipsparseGetMatDiagType(upTriFactor->descr); 877 878 /* Create the matrix description */ 879 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr)); 880 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 881 PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType)); 882 PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode)); 883 PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType)); 884 885 /* set the operation */ 886 upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE; 887 888 /* allocate GPU space for the CSC of the upper triangular factor*/ 889 upTriFactorT->csrMat = new CsrMatrix; 890 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 891 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 892 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 893 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 894 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 895 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 896 897 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 898 /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm 899 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0) 900 PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 901 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 902 upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 903 PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 904 #endif 905 */ 906 PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0)); 907 PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 908 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 909 #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/ 910 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 911 hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 912 #else 913 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase)); 914 #endif 915 916 PetscCallHIP(WaitForHIP()); 917 PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0)); 918 919 /* Create the solve analysis information */ 920 
PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 921 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 922 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 923 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 924 PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 925 926 /* perform the solve analysis */ 927 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 928 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 929 930 PetscCallHIP(WaitForHIP()); 931 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0)); 932 933 /* assign the pointer */ 934 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 935 PetscFunctionReturn(PETSC_SUCCESS); 936 } 937 938 struct PetscScalarToPetscInt { 939 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 940 }; 941 942 static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A) 943 { 944 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr; 945 Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT; 946 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 947 hipsparseIndexBase_t indexBase; 948 949 PetscFunctionBegin; 950 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 951 matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat; 952 PetscCheck(matstruct, PETSC_COMM_SELF, 
PETSC_ERR_GPU, "Missing mat struct"); 953 matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose; 954 PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 955 if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 956 PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0)); 957 PetscCall(PetscLogGpuTimeBegin()); 958 if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE)); 959 if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */ 960 matstructT = new Mat_SeqAIJHIPSPARSEMultStruct; 961 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr)); 962 indexBase = hipsparseGetMatIndexBase(matstruct->descr); 963 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase)); 964 PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); 965 966 /* set alpha and beta */ 967 PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar))); 968 PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar))); 969 PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar))); 970 PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice)); 971 PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice)); 972 PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice)); 973 974 if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { 975 CsrMatrix *matrixT = new CsrMatrix; 976 matstructT->mat = matrixT; 977 matrixT->num_rows = A->cmap->n; 978 matrixT->num_cols = A->rmap->n; 979 matrixT->num_entries = a->nz; 980 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 981 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 982 matrixT->values = new 
THRUSTARRAY(a->nz); 983 984 if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 985 hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 986 987 PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 988 indexBase, hipsparse_scalartype)); 989 } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) { 990 CsrMatrix *temp = new CsrMatrix; 991 CsrMatrix *tempT = new CsrMatrix; 992 /* First convert HYB to CSR */ 993 temp->num_rows = A->rmap->n; 994 temp->num_cols = A->cmap->n; 995 temp->num_entries = a->nz; 996 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 997 temp->column_indices = new THRUSTINTARRAY32(a->nz); 998 temp->values = new THRUSTARRAY(a->nz); 999 1000 PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get())); 1001 1002 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1003 tempT->num_rows = A->rmap->n; 1004 tempT->num_cols = A->cmap->n; 1005 tempT->num_entries = a->nz; 1006 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1007 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1008 tempT->values = new THRUSTARRAY(a->nz); 1009 1010 PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 1011 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase)); 1012 1013 /* Last, convert CSC to HYB */ 1014 hipsparseHybMat_t hybMat; 1015 PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat)); 1016 hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO; 1017 PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition)); 1018 1019 /* assign the pointer */ 1020 matstructT->mat = hybMat; 1021 A->transupdated = PETSC_TRUE; 1022 /* delete temporaries */ 1023 if (tempT) { 1024 if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1025 if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1026 if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1027 delete (CsrMatrix *)tempT; 1028 } 1029 if (temp) { 1030 if (temp->values) delete (THRUSTARRAY *)temp->values; 1031 if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1032 if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1033 delete (CsrMatrix *)temp; 1034 } 1035 } 1036 } 1037 if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1038 CsrMatrix 
*matrix = (CsrMatrix *)matstruct->mat; 1039 CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 1040 PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 1041 PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 1042 PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 1043 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 1044 PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 1045 PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 1046 PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 1047 PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1048 if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1049 hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1050 hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1051 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1052 } 1053 if (!hipsparsestruct->csr2csc_i) { 1054 THRUSTARRAY csr2csc_a(matrix->num_entries); 1055 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1056 1057 indexBase = hipsparseGetMatIndexBase(matstruct->descr); 1058 if (matrix->num_entries) { 1059 /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10 1060 Need to verify this for ROCm. 
1061 */ 1062 PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1063 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase)); 1064 } else { 1065 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1066 } 1067 1068 hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1069 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1070 } 1071 PetscCallThrust( 1072 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1073 } 1074 PetscCall(PetscLogGpuTimeEnd()); 1075 PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0)); 1076 /* the compressed row indices is not used for matTranspose */ 1077 matstructT->cprowIndices = NULL; 1078 /* assign the pointer */ 1079 ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT; 1080 A->transupdated = PETSC_TRUE; 1081 PetscFunctionReturn(PETSC_SUCCESS); 1082 } 1083 1084 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? 
*/ 1085 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx) 1086 { 1087 PetscInt n = xx->map->n; 1088 const PetscScalar *barray; 1089 PetscScalar *xarray; 1090 thrust::device_ptr<const PetscScalar> bGPU; 1091 thrust::device_ptr<PetscScalar> xGPU; 1092 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; 1093 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose; 1094 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose; 1095 THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector; 1096 1097 PetscFunctionBegin; 1098 /* Analyze the matrix and create the transpose ... on the fly */ 1099 if (!loTriFactorT && !upTriFactorT) { 1100 PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A)); 1101 loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose; 1102 upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose; 1103 } 1104 1105 /* Get the GPU pointers */ 1106 PetscCall(VecHIPGetArrayWrite(xx, &xarray)); 1107 PetscCall(VecHIPGetArrayRead(bb, &barray)); 1108 xGPU = thrust::device_pointer_cast(xarray); 1109 bGPU = thrust::device_pointer_cast(barray); 1110 1111 PetscCall(PetscLogGpuTimeBegin()); 1112 /* First, reorder with the row permutation */ 1113 thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU); 1114 1115 /* First, solve U */ 1116 PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, 
upTriFactorT->csrMat->values->data().get(), 1117 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1118 1119 /* Then, solve L */ 1120 PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1121 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1122 1123 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1124 thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin()); 1125 1126 /* Copy the temporary to the full solution. 
*/ 1127 thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU); 1128 1129 /* restore */ 1130 PetscCall(VecHIPRestoreArrayRead(bb, &barray)); 1131 PetscCall(VecHIPRestoreArrayWrite(xx, &xarray)); 1132 PetscCall(PetscLogGpuTimeEnd()); 1133 PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n)); 1134 PetscFunctionReturn(PETSC_SUCCESS); 1135 } 1136 1137 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1138 { 1139 const PetscScalar *barray; 1140 PetscScalar *xarray; 1141 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; 1142 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose; 1143 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose; 1144 THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector; 1145 1146 PetscFunctionBegin; 1147 /* Analyze the matrix and create the transpose ... 
on the fly */ 1148 if (!loTriFactorT && !upTriFactorT) { 1149 PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A)); 1150 loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose; 1151 upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose; 1152 } 1153 1154 /* Get the GPU pointers */ 1155 PetscCall(VecHIPGetArrayWrite(xx, &xarray)); 1156 PetscCall(VecHIPGetArrayRead(bb, &barray)); 1157 1158 PetscCall(PetscLogGpuTimeBegin()); 1159 /* First, solve U */ 1160 PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1161 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1162 1163 /* Then, solve L */ 1164 PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1165 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1166 1167 /* restore */ 1168 PetscCall(VecHIPRestoreArrayRead(bb, &barray)); 1169 PetscCall(VecHIPRestoreArrayWrite(xx, &xarray)); 1170 PetscCall(PetscLogGpuTimeEnd()); 1171 PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n)); 1172 PetscFunctionReturn(PETSC_SUCCESS); 1173 } 1174 1175 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx) 1176 { 1177 const PetscScalar *barray; 1178 PetscScalar *xarray; 1179 thrust::device_ptr<const PetscScalar> 
bGPU; 1180 thrust::device_ptr<PetscScalar> xGPU; 1181 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; 1182 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr; 1183 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr; 1184 THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector; 1185 1186 PetscFunctionBegin; 1187 /* Get the GPU pointers */ 1188 PetscCall(VecHIPGetArrayWrite(xx, &xarray)); 1189 PetscCall(VecHIPGetArrayRead(bb, &barray)); 1190 xGPU = thrust::device_pointer_cast(xarray); 1191 bGPU = thrust::device_pointer_cast(barray); 1192 1193 PetscCall(PetscLogGpuTimeBegin()); 1194 /* First, reorder with the row permutation */ 1195 thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1196 1197 /* Next, solve L */ 1198 PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1199 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1200 1201 /* Then, solve U */ 1202 PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1203 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, 
tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1204 1205 /* Last, reorder with the column permutation */ 1206 thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU); 1207 1208 PetscCall(VecHIPRestoreArrayRead(bb, &barray)); 1209 PetscCall(VecHIPRestoreArrayWrite(xx, &xarray)); 1210 PetscCall(PetscLogGpuTimeEnd()); 1211 PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n)); 1212 PetscFunctionReturn(PETSC_SUCCESS); 1213 } 1214 1215 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1216 { 1217 const PetscScalar *barray; 1218 PetscScalar *xarray; 1219 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; 1220 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr; 1221 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr; 1222 THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector; 1223 1224 PetscFunctionBegin; 1225 /* Get the GPU pointers */ 1226 PetscCall(VecHIPGetArrayWrite(xx, &xarray)); 1227 PetscCall(VecHIPGetArrayRead(bb, &barray)); 1228 1229 PetscCall(PetscLogGpuTimeBegin()); 1230 /* First, solve L */ 1231 PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1232 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1233 1234 /* Next, solve U */ 1235 
PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1236 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1237 1238 PetscCall(VecHIPRestoreArrayRead(bb, &barray)); 1239 PetscCall(VecHIPRestoreArrayWrite(xx, &xarray)); 1240 PetscCall(PetscLogGpuTimeEnd()); 1241 PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n)); 1242 PetscFunctionReturn(PETSC_SUCCESS); 1243 } 1244 1245 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0) 1246 /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/ 1247 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x) 1248 { 1249 Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr; 1250 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1251 const PetscScalar *barray; 1252 PetscScalar *xarray; 1253 1254 PetscFunctionBegin; 1255 PetscCall(VecHIPGetArrayWrite(x, &xarray)); 1256 PetscCall(VecHIPGetArrayRead(b, &barray)); 1257 PetscCall(PetscLogGpuTimeBegin()); 1258 1259 /* Solve L*y = b */ 1260 PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1261 PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1262 #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0) 1263 PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1264 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()! 
1265 #else 1266 PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1267 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()! 1268 #endif 1269 /* Solve U*x = y */ 1270 PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1271 #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0) 1272 PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */ 1273 fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U)); 1274 #else 1275 PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */ 1276 fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1277 #endif 1278 PetscCall(VecHIPRestoreArrayRead(b, &barray)); 1279 PetscCall(VecHIPRestoreArrayWrite(x, &xarray)); 1280 1281 PetscCall(PetscLogGpuTimeEnd()); 1282 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1283 PetscFunctionReturn(PETSC_SUCCESS); 1284 } 1285 1286 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x) 1287 { 1288 Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr; 1289 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1290 const PetscScalar *barray; 1291 PetscScalar *xarray; 1292 1293 PetscFunctionBegin; 1294 if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */ 1295 PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1296 PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, 
fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */ 1297 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1298 1299 PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1300 PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1301 PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1302 PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1303 fs->createdTransposeSpSVDescr = PETSC_TRUE; 1304 } 1305 1306 if (!fs->updatedTransposeSpSVAnalysis) { 1307 PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1308 1309 PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1310 fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1311 } 1312 1313 PetscCall(VecHIPGetArrayWrite(x, &xarray)); 1314 PetscCall(VecHIPGetArrayRead(b, &barray)); 1315 PetscCall(PetscLogGpuTimeBegin()); 1316 1317 /* Solve Ut*y = b */ 1318 PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1319 PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1320 #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0) 1321 PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */ 1322 fs->dnVecDescr_X, fs->dnVecDescr_Y, 
hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut)); 1323 #else 1324 PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */ 1325 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1326 #endif 1327 /* Solve Lt*x = y */ 1328 PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1329 #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0) 1330 PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1331 fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1332 #else 1333 PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1334 fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1335 #endif 1336 PetscCall(VecHIPRestoreArrayRead(b, &barray)); 1337 PetscCall(VecHIPRestoreArrayWrite(x, &xarray)); 1338 PetscCall(PetscLogGpuTimeEnd()); 1339 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1340 PetscFunctionReturn(PETSC_SUCCESS); 1341 } 1342 1343 static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) 1344 { 1345 Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr; 1346 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1347 Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; 1348 CsrMatrix *Acsr; 1349 PetscInt m, nz; 1350 PetscBool flg; 1351 1352 PetscFunctionBegin; 1353 if (PetscDefined(USE_DEBUG)) { 1354 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg)); 1355 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input 
is %s", ((PetscObject)A)->type_name); 1356 } 1357 1358 /* Copy A's value to fact */ 1359 m = fact->rmap->n; 1360 nz = aij->nz; 1361 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 1362 Acsr = (CsrMatrix *)Acusp->mat->mat; 1363 PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream)); 1364 1365 /* Factorize fact inplace */ 1366 if (m) 1367 PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */ 1368 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1369 if (PetscDefined(USE_DEBUG)) { 1370 int numerical_zero; 1371 hipsparseStatus_t status; 1372 status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1373 PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1374 } 1375 1376 /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */ 1377 PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1378 1379 PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1380 1381 /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */ 1382 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 1383 1384 fact->offloadmask = PETSC_OFFLOAD_GPU; 1385 fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ILU0; 1386 
fact->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0; 1387 fact->ops->matsolve = NULL; 1388 fact->ops->matsolvetranspose = NULL; 1389 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1390 PetscFunctionReturn(PETSC_SUCCESS); 1391 } 1392 1393 static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1394 { 1395 Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr; 1396 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1397 PetscInt m, nz; 1398 1399 PetscFunctionBegin; 1400 if (PetscDefined(USE_DEBUG)) { 1401 PetscBool flg, diagDense; 1402 1403 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg)); 1404 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name); 1405 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1406 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense)); 1407 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries"); 1408 } 1409 1410 /* Free the old stale stuff */ 1411 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs)); 1412 1413 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1414 but they will not be used. Allocate them just for easy debugging. 
1415 */ 1416 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1417 1418 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1419 fact->factortype = MAT_FACTOR_ILU; 1420 fact->info.factor_mallocs = 0; 1421 fact->info.fill_ratio_given = info->fill; 1422 fact->info.fill_ratio_needed = 1.0; 1423 1424 aij->row = NULL; 1425 aij->col = NULL; 1426 1427 /* ====================================================================== */ 1428 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1429 /* We'll do in-place factorization on fact */ 1430 /* ====================================================================== */ 1431 const int *Ai, *Aj; 1432 1433 m = fact->rmap->n; 1434 nz = aij->nz; 1435 1436 PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1))); 1437 PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz)); 1438 PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 1439 PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 1440 PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream)); 1441 PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream)); 1442 1443 /* ====================================================================== */ 1444 /* Create descriptors for M, L, U */ 1445 /* ====================================================================== */ 1446 hipsparseFillMode_t fillMode; 1447 hipsparseDiagType_t diagType; 1448 1449 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M)); 1450 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO)); 1451 PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL)); 1452 1453 /* 
https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054 1454 hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1455 assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1456 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1457 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 1458 */ 1459 fillMode = HIPSPARSE_FILL_MODE_LOWER; 1460 diagType = HIPSPARSE_DIAG_TYPE_UNIT; 1461 PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype)); 1462 PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1463 PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1464 1465 fillMode = HIPSPARSE_FILL_MODE_UPPER; 1466 diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT; 1467 PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype)); 1468 PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1469 PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1470 1471 /* ========================================================================= */ 1472 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1473 /* ========================================================================= */ 1474 
PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1475 if (m) 1476 PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */ 1477 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M)); 1478 1479 PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1480 PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1481 1482 PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype)); 1483 PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype)); 1484 1485 PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L)); 1486 PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1487 1488 PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U)); 1489 PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1490 1491 /* It appears spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1492 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1493 */ 1494 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1495 PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1496 fs->spsvBuffer_L = fs->factBuffer_M; 1497 PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1498 } else { 1499 PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1500 fs->spsvBuffer_U = fs->factBuffer_M; 1501 PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1502 } 1503 1504 /* ========================================================================== */ 1505 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1506 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1507 /* ========================================================================== */ 1508 int structural_zero; 1509 1510 fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL; 1511 if (m) 1512 PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */ 1513 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1514 if (PetscDefined(USE_DEBUG)) { 1515 /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1516 hipsparseStatus_t status; 1517 status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1518 PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1519 } 1520 1521 /* Estimate FLOPs of the numeric factorization */ 1522 { 1523 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1524 PetscInt *Ai, nzRow, nzLeft; 1525 PetscLogDouble flops = 0.0; 1526 const PetscInt *Adiag; 1527 1528 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &Adiag, NULL)); 1529 Ai = Aseq->i; 1530 for (PetscInt i = 0; i < m; i++) { 1531 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1532 nzRow = Ai[i + 1] - Ai[i]; 1533 nzLeft = Adiag[i] - Ai[i]; 1534 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1535 and include the eliminated one will be updated, which incurs a multiplication and an addition. 
1536 */ 1537 nzLeft = (nzRow - 1) / 2; 1538 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1539 } 1540 } 1541 fs->numericFactFlops = flops; 1542 } 1543 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0; 1544 PetscFunctionReturn(PETSC_SUCCESS); 1545 } 1546 1547 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x) 1548 { 1549 Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr; 1550 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1551 const PetscScalar *barray; 1552 PetscScalar *xarray; 1553 1554 PetscFunctionBegin; 1555 PetscCall(VecHIPGetArrayWrite(x, &xarray)); 1556 PetscCall(VecHIPGetArrayRead(b, &barray)); 1557 PetscCall(PetscLogGpuTimeBegin()); 1558 1559 /* Solve L*y = b */ 1560 PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1561 PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1562 #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0) 1563 PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1564 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1565 #else 1566 PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1567 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1568 #endif 1569 /* Solve Lt*x = y */ 1570 PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1571 #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0) 1572 PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1573 fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1574 #else 1575 
PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1576 fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1577 #endif 1578 PetscCall(VecHIPRestoreArrayRead(b, &barray)); 1579 PetscCall(VecHIPRestoreArrayWrite(x, &xarray)); 1580 1581 PetscCall(PetscLogGpuTimeEnd()); 1582 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1583 PetscFunctionReturn(PETSC_SUCCESS); 1584 } 1585 1586 static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) 1587 { 1588 Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr; 1589 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1590 Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; 1591 CsrMatrix *Acsr; 1592 PetscInt m, nz; 1593 PetscBool flg; 1594 1595 PetscFunctionBegin; 1596 if (PetscDefined(USE_DEBUG)) { 1597 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg)); 1598 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name); 1599 } 1600 1601 /* Copy A's value to fact */ 1602 m = fact->rmap->n; 1603 nz = aij->nz; 1604 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 1605 Acsr = (CsrMatrix *)Acusp->mat->mat; 1606 PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream)); 1607 1608 /* Factorize fact inplace */ 1609 /* Function csric02() only takes the lower triangular part of matrix A to perform factorization. 1610 The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1611 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 
1612 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1613 */ 1614 if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1615 if (PetscDefined(USE_DEBUG)) { 1616 int numerical_zero; 1617 hipsparseStatus_t status; 1618 status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1619 PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1620 } 1621 1622 PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1623 1624 /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1625 ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1626 */ 1627 PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1628 1629 fact->offloadmask = PETSC_OFFLOAD_GPU; 1630 fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ICC0; 1631 fact->ops->solvetranspose = MatSolve_SeqAIJHIPSPARSE_ICC0; 1632 fact->ops->matsolve = NULL; 1633 fact->ops->matsolvetranspose = NULL; 1634 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1635 PetscFunctionReturn(PETSC_SUCCESS); 1636 } 1637 1638 static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1639 { 1640 Mat_SeqAIJHIPSPARSETriFactors *fs = 
(Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr; 1641 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1642 PetscInt m, nz; 1643 1644 PetscFunctionBegin; 1645 if (PetscDefined(USE_DEBUG)) { 1646 PetscBool flg, diagDense; 1647 1648 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg)); 1649 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name); 1650 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1651 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense)); 1652 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries"); 1653 } 1654 1655 /* Free the old stale stuff */ 1656 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs)); 1657 1658 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1659 but they will not be used. Allocate them just for easy debugging. 1660 */ 1661 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1662 1663 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1664 fact->factortype = MAT_FACTOR_ICC; 1665 fact->info.factor_mallocs = 0; 1666 fact->info.fill_ratio_given = info->fill; 1667 fact->info.fill_ratio_needed = 1.0; 1668 1669 aij->row = NULL; 1670 aij->col = NULL; 1671 1672 /* ====================================================================== */ 1673 /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 1674 /* We'll do in-place factorization on fact */ 1675 /* ====================================================================== */ 1676 const int *Ai, *Aj; 1677 1678 m = fact->rmap->n; 1679 nz = aij->nz; 1680 1681 PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1))); 1682 PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz)); 1683 PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 1684 PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 1685 PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream)); 1686 PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream)); 1687 1688 /* ====================================================================== */ 1689 /* Create mat descriptors for M, L */ 1690 /* ====================================================================== */ 1691 hipsparseFillMode_t fillMode; 1692 hipsparseDiagType_t diagType; 1693 1694 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M)); 1695 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO)); 1696 PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL)); 1697 1698 /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054 1699 hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1700 assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1701 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1702 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1703 */ 1704 fillMode = HIPSPARSE_FILL_MODE_LOWER; 1705 diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT; 1706 PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype)); 1707 PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1708 PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1709 1710 /* ========================================================================= */ 1711 /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 1712 /* ========================================================================= */ 1713 PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M)); 1714 if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M)); 1715 1716 PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1717 PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1718 1719 PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype)); 1720 PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype)); 1721 1722 PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L)); 1723 PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1724 1725 PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1726 PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, 
hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1727 1728 /* To save device memory, we make the factorization buffer share with one of the solver buffer. 1729 See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`. 1730 */ 1731 if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 1732 PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1733 fs->spsvBuffer_L = fs->factBuffer_M; 1734 PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1735 } else { 1736 PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 1737 fs->spsvBuffer_Lt = fs->factBuffer_M; 1738 PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1739 } 1740 1741 /* ========================================================================== */ 1742 /* Perform analysis of ic0 on M */ 1743 /* The lower triangular part of M has the same sparsity pattern as L */ 1744 /* ========================================================================== */ 1745 int structural_zero; 1746 1747 fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL; 1748 if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1749 if (PetscDefined(USE_DEBUG)) { 1750 hipsparseStatus_t status; 1751 /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1752 status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 1753 PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 1754 } 1755 1756 /* Estimate FLOPs of the numeric factorization */ 1757 { 1758 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1759 PetscInt *Ai, nzRow, nzLeft; 1760 PetscLogDouble flops = 0.0; 1761 1762 Ai = Aseq->i; 1763 for (PetscInt i = 0; i < m; i++) { 1764 nzRow = Ai[i + 1] - Ai[i]; 1765 if (nzRow > 1) { 1766 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1767 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1768 */ 1769 nzLeft = (nzRow - 1) / 2; 1770 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1771 } 1772 } 1773 fs->numericFactFlops = flops; 1774 } 1775 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0; 1776 PetscFunctionReturn(PETSC_SUCCESS); 1777 } 1778 #endif 1779 1780 static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1781 { 1782 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr; 1783 1784 PetscFunctionBegin; 1785 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0) 1786 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 1787 if (!info->factoronhost) { 1788 PetscCall(ISIdentity(isrow, &row_identity)); 1789 PetscCall(ISIdentity(iscol, &col_identity)); 1790 } 1791 if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info)); 1792 else 1793 #endif 1794 { 1795 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors)); 1796 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1797 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE; 1798 } 1799 
PetscFunctionReturn(PETSC_SUCCESS); 1800 } 1801 1802 static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1803 { 1804 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr; 1805 1806 PetscFunctionBegin; 1807 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors)); 1808 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1809 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE; 1810 PetscFunctionReturn(PETSC_SUCCESS); 1811 } 1812 1813 static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 1814 { 1815 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr; 1816 1817 PetscFunctionBegin; 1818 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0) 1819 PetscBool perm_identity = PETSC_FALSE; 1820 if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); 1821 if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info)); 1822 else 1823 #endif 1824 { 1825 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors)); 1826 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 1827 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE; 1828 } 1829 PetscFunctionReturn(PETSC_SUCCESS); 1830 } 1831 1832 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 1833 { 1834 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr; 1835 1836 PetscFunctionBegin; 1837 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors)); 1838 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 1839 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE; 1840 PetscFunctionReturn(PETSC_SUCCESS); 1841 } 1842 1843 static PetscErrorCode 
MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type) 1844 { 1845 PetscFunctionBegin; 1846 *type = MATSOLVERHIPSPARSE; 1847 PetscFunctionReturn(PETSC_SUCCESS); 1848 } 1849 1850 /*MC 1851 MATSOLVERHIPSPARSE = "hipsparse" - A matrix type providing triangular solvers for sequential matrices 1852 on a single GPU of type, `MATSEQAIJHIPSPARSE`. Currently supported 1853 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 1854 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 1855 HipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 1856 algorithms are not recommended. This class does NOT support direct solver operations. 1857 1858 Level: beginner 1859 1860 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation` 1861 M*/ 1862 1863 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B) 1864 { 1865 PetscInt n = A->rmap->n; 1866 1867 PetscFunctionBegin; 1868 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1869 PetscCall(MatSetSizes(*B, n, n, n, n)); 1870 (*B)->factortype = ftype; 1871 PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE)); 1872 1873 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 1874 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 1875 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1876 if (!A->boundtocpu) { 1877 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE; 1878 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJHIPSPARSE; 1879 } else { 1880 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 1881 (*B)->ops->lufactorsymbolic = 
MatLUFactorSymbolic_SeqAIJ; 1882 } 1883 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1884 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 1885 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 1886 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 1887 if (!A->boundtocpu) { 1888 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJHIPSPARSE; 1889 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE; 1890 } else { 1891 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 1892 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 1893 } 1894 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 1895 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 1896 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types"); 1897 1898 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1899 (*B)->canuseordering = PETSC_TRUE; 1900 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse)); 1901 PetscFunctionReturn(PETSC_SUCCESS); 1902 } 1903 1904 static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A) 1905 { 1906 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1907 Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; 1908 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0) 1909 Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; 1910 #endif 1911 1912 PetscFunctionBegin; 1913 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1914 PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0)); 1915 if (A->factortype == MAT_FACTOR_NONE) { 1916 CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat; 1917 PetscCallHIP(hipMemcpy(a->a, 
matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost)); 1918 } 1919 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0) 1920 else if (fs->csrVal) { 1921 /* We have a factorized matrix on device and are able to copy it to host */ 1922 PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost)); 1923 } 1924 #endif 1925 else 1926 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host"); 1927 PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar))); 1928 PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0)); 1929 A->offloadmask = PETSC_OFFLOAD_BOTH; 1930 } 1931 PetscFunctionReturn(PETSC_SUCCESS); 1932 } 1933 1934 static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[]) 1935 { 1936 PetscFunctionBegin; 1937 PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A)); 1938 *array = ((Mat_SeqAIJ *)A->data)->a; 1939 PetscFunctionReturn(PETSC_SUCCESS); 1940 } 1941 1942 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[]) 1943 { 1944 PetscFunctionBegin; 1945 A->offloadmask = PETSC_OFFLOAD_CPU; 1946 *array = NULL; 1947 PetscFunctionReturn(PETSC_SUCCESS); 1948 } 1949 1950 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[]) 1951 { 1952 PetscFunctionBegin; 1953 PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A)); 1954 *array = ((Mat_SeqAIJ *)A->data)->a; 1955 PetscFunctionReturn(PETSC_SUCCESS); 1956 } 1957 1958 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[]) 1959 { 1960 PetscFunctionBegin; 1961 *array = NULL; 1962 PetscFunctionReturn(PETSC_SUCCESS); 1963 } 1964 1965 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[]) 1966 { 1967 PetscFunctionBegin; 1968 *array = ((Mat_SeqAIJ *)A->data)->a; 1969 PetscFunctionReturn(PETSC_SUCCESS); 1970 } 1971 1972 static PetscErrorCode 
MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[]) 1973 { 1974 PetscFunctionBegin; 1975 A->offloadmask = PETSC_OFFLOAD_CPU; 1976 *array = NULL; 1977 PetscFunctionReturn(PETSC_SUCCESS); 1978 } 1979 1980 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 1981 { 1982 Mat_SeqAIJHIPSPARSE *cusp; 1983 CsrMatrix *matrix; 1984 1985 PetscFunctionBegin; 1986 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 1987 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 1988 cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr); 1989 PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL"); 1990 matrix = (CsrMatrix *)cusp->mat->mat; 1991 1992 if (i) { 1993 #if !defined(PETSC_USE_64BIT_INDICES) 1994 *i = matrix->row_offsets->data().get(); 1995 #else 1996 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices"); 1997 #endif 1998 } 1999 if (j) { 2000 #if !defined(PETSC_USE_64BIT_INDICES) 2001 *j = matrix->column_indices->data().get(); 2002 #else 2003 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices"); 2004 #endif 2005 } 2006 if (a) *a = matrix->values->data().get(); 2007 if (mtype) *mtype = PETSC_MEMTYPE_HIP; 2008 PetscFunctionReturn(PETSC_SUCCESS); 2009 } 2010 2011 PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A) 2012 { 2013 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr; 2014 Mat_SeqAIJHIPSPARSEMultStruct *matstruct = hipsparsestruct->mat; 2015 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2016 PetscBool both = PETSC_TRUE; 2017 PetscInt m = A->rmap->n, *ii, *ridx, tmp; 2018 2019 PetscFunctionBegin; 2020 PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU"); 2021 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == 
PETSC_OFFLOAD_CPU) { 2022 if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */ 2023 CsrMatrix *matrix; 2024 matrix = (CsrMatrix *)hipsparsestruct->mat->mat; 2025 2026 PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 2027 PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0)); 2028 matrix->values->assign(a->a, a->a + a->nz); 2029 PetscCallHIP(WaitForHIP()); 2030 PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar))); 2031 PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0)); 2032 PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE)); 2033 } else { 2034 PetscInt nnz; 2035 PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0)); 2036 PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format)); 2037 PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE)); 2038 delete hipsparsestruct->workVector; 2039 delete hipsparsestruct->rowoffsets_gpu; 2040 hipsparsestruct->workVector = NULL; 2041 hipsparsestruct->rowoffsets_gpu = NULL; 2042 try { 2043 if (a->compressedrow.use) { 2044 m = a->compressedrow.nrows; 2045 ii = a->compressedrow.i; 2046 ridx = a->compressedrow.rindex; 2047 } else { 2048 m = A->rmap->n; 2049 ii = a->i; 2050 ridx = NULL; 2051 } 2052 PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 2053 if (!a->a) { 2054 nnz = ii[m]; 2055 both = PETSC_FALSE; 2056 } else nnz = a->nz; 2057 PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 2058 2059 /* create hipsparse matrix */ 2060 hipsparsestruct->nrows = m; 2061 matstruct = new Mat_SeqAIJHIPSPARSEMultStruct; 2062 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr)); 2063 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO)); 2064 PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); 2065 2066 
PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar))); 2067 PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar))); 2068 PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar))); 2069 PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice)); 2070 PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice)); 2071 PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice)); 2072 PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE)); 2073 2074 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2075 if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { 2076 /* set the matrix */ 2077 CsrMatrix *mat = new CsrMatrix; 2078 mat->num_rows = m; 2079 mat->num_cols = A->cmap->n; 2080 mat->num_entries = nnz; 2081 mat->row_offsets = new THRUSTINTARRAY32(m + 1); 2082 mat->column_indices = new THRUSTINTARRAY32(nnz); 2083 mat->values = new THRUSTARRAY(nnz); 2084 mat->row_offsets->assign(ii, ii + m + 1); 2085 mat->column_indices->assign(a->j, a->j + nnz); 2086 if (a->a) mat->values->assign(a->a, a->a + nnz); 2087 2088 /* assign the pointer */ 2089 matstruct->mat = mat; 2090 if (mat->num_rows) { /* hipsparse errors on empty matrices! 
*/ 2091 PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2092 HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype)); 2093 } 2094 } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) { 2095 CsrMatrix *mat = new CsrMatrix; 2096 mat->num_rows = m; 2097 mat->num_cols = A->cmap->n; 2098 mat->num_entries = nnz; 2099 mat->row_offsets = new THRUSTINTARRAY32(m + 1); 2100 mat->column_indices = new THRUSTINTARRAY32(nnz); 2101 mat->values = new THRUSTARRAY(nnz); 2102 mat->row_offsets->assign(ii, ii + m + 1); 2103 mat->column_indices->assign(a->j, a->j + nnz); 2104 if (a->a) mat->values->assign(a->a, a->a + nnz); 2105 2106 hipsparseHybMat_t hybMat; 2107 PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat)); 2108 hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? 
HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO; 2109 PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition)); 2110 /* assign the pointer */ 2111 matstruct->mat = hybMat; 2112 2113 if (mat) { 2114 if (mat->values) delete (THRUSTARRAY *)mat->values; 2115 if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2116 if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2117 delete (CsrMatrix *)mat; 2118 } 2119 } 2120 2121 /* assign the compressed row indices */ 2122 if (a->compressedrow.use) { 2123 hipsparsestruct->workVector = new THRUSTARRAY(m); 2124 matstruct->cprowIndices = new THRUSTINTARRAY(m); 2125 matstruct->cprowIndices->assign(ridx, ridx + m); 2126 tmp = m; 2127 } else { 2128 hipsparsestruct->workVector = NULL; 2129 matstruct->cprowIndices = NULL; 2130 tmp = 0; 2131 } 2132 PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2133 2134 /* assign the pointer */ 2135 hipsparsestruct->mat = matstruct; 2136 } catch (char *ex) { 2137 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex); 2138 } 2139 PetscCallHIP(WaitForHIP()); 2140 PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0)); 2141 hipsparsestruct->nonzerostate = A->nonzerostate; 2142 } 2143 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2144 } 2145 PetscFunctionReturn(PETSC_SUCCESS); 2146 } 2147 2148 struct VecHIPPlusEquals { 2149 template <typename Tuple> 2150 __host__ __device__ void operator()(Tuple t) 2151 { 2152 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2153 } 2154 }; 2155 2156 struct VecHIPEquals { 2157 template <typename Tuple> 2158 __host__ __device__ void operator()(Tuple t) 2159 { 2160 thrust::get<1>(t) = thrust::get<0>(t); 2161 } 2162 }; 2163 2164 struct VecHIPEqualsReverse 
{ 2165 template <typename Tuple> 2166 __host__ __device__ void operator()(Tuple t) 2167 { 2168 thrust::get<0>(t) = thrust::get<1>(t); 2169 } 2170 }; 2171 2172 struct MatProductCtx_MatMatHipsparse { 2173 PetscBool cisdense; 2174 PetscScalar *Bt; 2175 Mat X; 2176 PetscBool reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */ 2177 PetscLogDouble flops; 2178 CsrMatrix *Bcsr; 2179 hipsparseSpMatDescr_t matSpBDescr; 2180 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2181 hipsparseDnMatDescr_t matBDescr; 2182 hipsparseDnMatDescr_t matCDescr; 2183 PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2184 #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) 2185 void *dBuffer4, *dBuffer5; 2186 #endif 2187 size_t mmBufferSize; 2188 void *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2189 hipsparseSpGEMMDescr_t spgemmDesc; 2190 }; 2191 2192 static PetscErrorCode MatProductCtxDestroy_MatMatHipsparse(PetscCtxRt data) 2193 { 2194 MatProductCtx_MatMatHipsparse *mmdata = *(MatProductCtx_MatMatHipsparse **)data; 2195 2196 PetscFunctionBegin; 2197 PetscCallHIP(hipFree(mmdata->Bt)); 2198 delete mmdata->Bcsr; 2199 if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr)); 2200 if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr)); 2201 if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr)); 2202 if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2203 #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) 2204 if (mmdata->dBuffer4) PetscCallHIP(hipFree(mmdata->dBuffer4)); 2205 if (mmdata->dBuffer5) PetscCallHIP(hipFree(mmdata->dBuffer5)); 2206 #endif 2207 if (mmdata->mmBuffer) PetscCallHIP(hipFree(mmdata->mmBuffer)); 2208 if (mmdata->mmBuffer2) PetscCallHIP(hipFree(mmdata->mmBuffer2)); 2209 PetscCall(MatDestroy(&mmdata->X)); 2210 PetscCall(PetscFree(*(void **)data)); 2211 
PetscFunctionReturn(PETSC_SUCCESS); 2212 } 2213 2214 static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C) 2215 { 2216 Mat_Product *product = C->product; 2217 Mat A, B; 2218 PetscInt m, n, blda, clda; 2219 PetscBool flg, biship; 2220 Mat_SeqAIJHIPSPARSE *cusp; 2221 hipsparseOperation_t opA; 2222 const PetscScalar *barray; 2223 PetscScalar *carray; 2224 MatProductCtx_MatMatHipsparse *mmdata; 2225 Mat_SeqAIJHIPSPARSEMultStruct *mat; 2226 CsrMatrix *csrmat; 2227 2228 PetscFunctionBegin; 2229 MatCheckProduct(C, 1); 2230 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2231 mmdata = (MatProductCtx_MatMatHipsparse *)product->data; 2232 A = product->A; 2233 B = product->B; 2234 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg)); 2235 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2236 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2237 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2238 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2239 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 2240 cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; 2241 switch (product->type) { 2242 case MATPRODUCT_AB: 2243 case MATPRODUCT_PtAP: 2244 mat = cusp->mat; 2245 opA = HIPSPARSE_OPERATION_NON_TRANSPOSE; 2246 m = A->rmap->n; 2247 n = B->cmap->n; 2248 break; 2249 case MATPRODUCT_AtB: 2250 if (!A->form_explicit_transpose) { 2251 mat = cusp->mat; 2252 opA = HIPSPARSE_OPERATION_TRANSPOSE; 2253 } else { 2254 PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A)); 2255 mat = cusp->matTranspose; 2256 opA = HIPSPARSE_OPERATION_NON_TRANSPOSE; 2257 } 2258 m = A->cmap->n; 2259 n = B->cmap->n; 2260 break; 2261 case MATPRODUCT_ABt: 2262 case MATPRODUCT_RARt: 2263 mat = 
cusp->mat; 2264 opA = HIPSPARSE_OPERATION_NON_TRANSPOSE; 2265 m = A->rmap->n; 2266 n = B->rmap->n; 2267 break; 2268 default: 2269 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2270 } 2271 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct"); 2272 csrmat = (CsrMatrix *)mat->mat; 2273 /* if the user passed a CPU matrix, copy the data to the GPU */ 2274 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship)); 2275 if (!biship) PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B)); 2276 PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr)); 2277 PetscCall(MatDenseGetLDA(B, &blda)); 2278 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2279 PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr)); 2280 PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2281 } else { 2282 PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr)); 2283 PetscCall(MatDenseGetLDA(C, &clda)); 2284 } 2285 2286 PetscCall(PetscLogGpuTimeBegin()); 2287 hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE; 2288 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2289 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2290 size_t mmBufferSize; 2291 if (mmdata->initialized && mmdata->Blda != blda) { 2292 PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr)); 2293 mmdata->matBDescr = NULL; 2294 } 2295 if (!mmdata->matBDescr) { 2296 PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL)); 2297 mmdata->Blda = blda; 2298 } 2299 if (mmdata->initialized && mmdata->Clda != clda) { 2300 PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr)); 2301 mmdata->matCDescr = NULL; 2302 } 2303 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2304 PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL)); 2305 mmdata->Clda = clda; 2306 } 2307 if (!mat->matDescr) { 2308 PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2309 HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype)); 2310 } 2311 PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize)); 2312 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2313 PetscCallHIP(hipFree(mmdata->mmBuffer)); 2314 PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize)); 2315 mmdata->mmBufferSize = mmBufferSize; 2316 } 2317 mmdata->initialized = PETSC_TRUE; 2318 } else { 2319 /* to be safe, always update pointers of 
the mats */ 2320 PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get())); 2321 PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 2322 PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2323 } 2324 2325 /* do hipsparseSpMM, which supports transpose on B */ 2326 PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2327 2328 PetscCall(PetscLogGpuTimeEnd()); 2329 PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 2330 PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray)); 2331 if (product->type == MATPRODUCT_RARt) { 2332 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2333 PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2334 } else if (product->type == MATPRODUCT_PtAP) { 2335 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2336 PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2337 } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray)); 2338 if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 2339 if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 2340 PetscFunctionReturn(PETSC_SUCCESS); 2341 } 2342 2343 static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C) 2344 { 2345 Mat_Product *product = C->product; 2346 Mat A, B; 2347 PetscInt m, n; 2348 PetscBool cisdense, flg; 2349 MatProductCtx_MatMatHipsparse *mmdata; 2350 Mat_SeqAIJHIPSPARSE *cusp; 2351 2352 PetscFunctionBegin; 2353 MatCheckProduct(C, 1); 2354 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2355 A = product->A; 2356 B = product->B; 2357 
PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg)); 2358 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2359 cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; 2360 PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format"); 2361 switch (product->type) { 2362 case MATPRODUCT_AB: 2363 m = A->rmap->n; 2364 n = B->cmap->n; 2365 break; 2366 case MATPRODUCT_AtB: 2367 m = A->cmap->n; 2368 n = B->cmap->n; 2369 break; 2370 case MATPRODUCT_ABt: 2371 m = A->rmap->n; 2372 n = B->rmap->n; 2373 break; 2374 case MATPRODUCT_PtAP: 2375 m = B->cmap->n; 2376 n = B->cmap->n; 2377 break; 2378 case MATPRODUCT_RARt: 2379 m = B->rmap->n; 2380 n = B->rmap->n; 2381 break; 2382 default: 2383 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2384 } 2385 PetscCall(MatSetSizes(C, m, n, m, n)); 2386 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2387 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 2388 PetscCall(MatSetType(C, MATSEQDENSEHIP)); 2389 2390 /* product data */ 2391 PetscCall(PetscNew(&mmdata)); 2392 mmdata->cisdense = cisdense; 2393 /* for these products we need intermediate storage */ 2394 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2395 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 2396 PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP)); 2397 /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */ 2398 if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2399 else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2400 } 2401 C->product->data = mmdata; 2402 C->product->destroy = 
MatProductCtxDestroy_MatMatHipsparse; 2403 C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP; 2404 PetscFunctionReturn(PETSC_SUCCESS); 2405 } 2406 2407 static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C) 2408 { 2409 Mat_Product *product = C->product; 2410 Mat A, B; 2411 Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp; 2412 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2413 Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat; 2414 CsrMatrix *Acsr, *Bcsr, *Ccsr; 2415 PetscBool flg; 2416 MatProductType ptype; 2417 MatProductCtx_MatMatHipsparse *mmdata; 2418 hipsparseSpMatDescr_t BmatSpDescr; 2419 hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */ 2420 2421 PetscFunctionBegin; 2422 MatCheckProduct(C, 1); 2423 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2424 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg)); 2425 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name); 2426 mmdata = (MatProductCtx_MatMatHipsparse *)C->product->data; 2427 A = product->A; 2428 B = product->B; 2429 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2430 mmdata->reusesym = PETSC_FALSE; 2431 Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr; 2432 PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format"); 2433 Cmat = Ccusp->mat; 2434 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]); 2435 Ccsr = (CsrMatrix *)Cmat->mat; 2436 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2437 goto finalize; 2438 } 2439 if (!c->nz) 
goto finalize; 2440 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg)); 2441 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2442 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg)); 2443 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 2444 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2445 PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2446 Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; 2447 Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr; 2448 Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr; 2449 PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format"); 2450 PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format"); 2451 PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format"); 2452 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 2453 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B)); 2454 2455 ptype = product->type; 2456 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2457 ptype = MATPRODUCT_AB; 2458 PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric"); 2459 } 2460 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2461 ptype = MATPRODUCT_AB; 2462 PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been 
built using the fact that B is symmetric"); 2463 } 2464 switch (ptype) { 2465 case MATPRODUCT_AB: 2466 Amat = Acusp->mat; 2467 Bmat = Bcusp->mat; 2468 break; 2469 case MATPRODUCT_AtB: 2470 Amat = Acusp->matTranspose; 2471 Bmat = Bcusp->mat; 2472 break; 2473 case MATPRODUCT_ABt: 2474 Amat = Acusp->mat; 2475 Bmat = Bcusp->matTranspose; 2476 break; 2477 default: 2478 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2479 } 2480 Cmat = Ccusp->mat; 2481 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2482 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2483 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]); 2484 Acsr = (CsrMatrix *)Amat->mat; 2485 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */ 2486 Ccsr = (CsrMatrix *)Cmat->mat; 2487 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2488 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2489 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2490 PetscCall(PetscLogGpuTimeBegin()); 2491 #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0) 2492 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2493 PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE)); 2494 #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) 2495 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc)); 2496 #else 2497 PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer)); 2498 PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc)); 2499 #endif 2500 #else 2501 PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, 2502 Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), 2503 Ccsr->column_indices->data().get())); 2504 #endif 2505 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2506 PetscCallHIP(WaitForHIP()); 2507 PetscCall(PetscLogGpuTimeEnd()); 2508 C->offloadmask = PETSC_OFFLOAD_GPU; 2509 finalize: 2510 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2511 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 2512 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 2513 PetscCall(PetscInfo(C, "Maximum nonzeros in any 
row is %" PetscInt_FMT "\n", c->rmax)); 2514 c->reallocs = 0; 2515 C->info.mallocs += 0; 2516 C->info.nz_unneeded = 0; 2517 C->assembled = C->was_assembled = PETSC_TRUE; 2518 C->num_ass++; 2519 PetscFunctionReturn(PETSC_SUCCESS); 2520 } 2521 2522 static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C) 2523 { 2524 Mat_Product *product = C->product; 2525 Mat A, B; 2526 Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp; 2527 Mat_SeqAIJ *a, *b, *c; 2528 Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat; 2529 CsrMatrix *Acsr, *Bcsr, *Ccsr; 2530 PetscInt i, j, m, n, k; 2531 PetscBool flg; 2532 MatProductType ptype; 2533 MatProductCtx_MatMatHipsparse *mmdata; 2534 PetscLogDouble flops; 2535 PetscBool biscompressed, ciscompressed; 2536 #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0) 2537 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2538 hipsparseSpMatDescr_t BmatSpDescr; 2539 #else 2540 int cnz; 2541 #endif 2542 hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */ 2543 2544 PetscFunctionBegin; 2545 MatCheckProduct(C, 1); 2546 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2547 A = product->A; 2548 B = product->B; 2549 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg)); 2550 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2551 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg)); 2552 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 2553 a = (Mat_SeqAIJ *)A->data; 2554 b = (Mat_SeqAIJ *)B->data; 2555 /* product data */ 2556 PetscCall(PetscNew(&mmdata)); 2557 C->product->data = mmdata; 2558 C->product->destroy = MatProductCtxDestroy_MatMatHipsparse; 2559 2560 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 2561 
PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B)); 2562 Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */ 2563 Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr; 2564 PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format"); 2565 PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format"); 2566 2567 ptype = product->type; 2568 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2569 ptype = MATPRODUCT_AB; 2570 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2571 } 2572 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2573 ptype = MATPRODUCT_AB; 2574 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2575 } 2576 biscompressed = PETSC_FALSE; 2577 ciscompressed = PETSC_FALSE; 2578 switch (ptype) { 2579 case MATPRODUCT_AB: 2580 m = A->rmap->n; 2581 n = B->cmap->n; 2582 k = A->cmap->n; 2583 Amat = Acusp->mat; 2584 Bmat = Bcusp->mat; 2585 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2586 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2587 break; 2588 case MATPRODUCT_AtB: 2589 m = A->cmap->n; 2590 n = B->cmap->n; 2591 k = A->rmap->n; 2592 PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A)); 2593 Amat = Acusp->matTranspose; 2594 Bmat = Bcusp->mat; 2595 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2596 break; 2597 case MATPRODUCT_ABt: 2598 m = A->rmap->n; 2599 n = B->rmap->n; 2600 k = A->cmap->n; 2601 PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B)); 2602 Amat = Acusp->mat; 2603 Bmat = Bcusp->matTranspose; 2604 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2605 break; 2606 default: 2607 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2608 } 2609 2610 /* create hipsparse matrix */ 2611 PetscCall(MatSetSizes(C, m, n, m, n)); 2612 
PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE)); 2613 c = (Mat_SeqAIJ *)C->data; 2614 Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr; 2615 Cmat = new Mat_SeqAIJHIPSPARSEMultStruct; 2616 Ccsr = new CsrMatrix; 2617 2618 c->compressedrow.use = ciscompressed; 2619 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2620 c->compressedrow.nrows = a->compressedrow.nrows; 2621 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 2622 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 2623 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2624 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2625 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 2626 } else { 2627 c->compressedrow.nrows = 0; 2628 c->compressedrow.i = NULL; 2629 c->compressedrow.rindex = NULL; 2630 Ccusp->workVector = NULL; 2631 Cmat->cprowIndices = NULL; 2632 } 2633 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2634 Ccusp->mat = Cmat; 2635 Ccusp->mat->mat = Ccsr; 2636 Ccsr->num_rows = Ccusp->nrows; 2637 Ccsr->num_cols = n; 2638 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 2639 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr)); 2640 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO)); 2641 PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); 2642 PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 2643 PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 2644 PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 2645 PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice)); 2646 PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice)); 2647 PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice)); 2648 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2649 thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0); 2650 c->nz = 0; 2651 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2652 Ccsr->values = new THRUSTARRAY(c->nz); 2653 goto finalizesym; 2654 } 2655 2656 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2657 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2658 Acsr = (CsrMatrix *)Amat->mat; 2659 if (!biscompressed) { 2660 Bcsr = (CsrMatrix *)Bmat->mat; 2661 BmatSpDescr = Bmat->matDescr; 2662 } else { /* we need to use row offsets for the full matrix */ 2663 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 2664 Bcsr = new CsrMatrix; 2665 Bcsr->num_rows = B->rmap->n; 2666 Bcsr->num_cols = cBcsr->num_cols; 2667 Bcsr->num_entries = cBcsr->num_entries; 2668 Bcsr->column_indices = cBcsr->column_indices; 2669 Bcsr->values = cBcsr->values; 2670 if (!Bcusp->rowoffsets_gpu) { 2671 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2672 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 2673 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 2674 } 2675 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2676 mmdata->Bcsr = Bcsr; 2677 if (Bcsr->num_rows && Bcsr->num_cols) { 2678 PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype)); 2679 } 2680 BmatSpDescr = mmdata->matSpBDescr; 2681 } 2682 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2683 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2684 /* precompute flops count */ 2685 if (ptype == MATPRODUCT_AB) { 2686 for 
(i = 0, flops = 0; i < A->rmap->n; i++) { 2687 const PetscInt st = a->i[i]; 2688 const PetscInt en = a->i[i + 1]; 2689 for (j = st; j < en; j++) { 2690 const PetscInt brow = a->j[j]; 2691 flops += 2. * (b->i[brow + 1] - b->i[brow]); 2692 } 2693 } 2694 } else if (ptype == MATPRODUCT_AtB) { 2695 for (i = 0, flops = 0; i < A->rmap->n; i++) { 2696 const PetscInt anzi = a->i[i + 1] - a->i[i]; 2697 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 2698 flops += (2. * anzi) * bnzi; 2699 } 2700 } else flops = 0.; /* TODO */ 2701 2702 mmdata->flops = flops; 2703 PetscCall(PetscLogGpuTimeBegin()); 2704 #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0) 2705 PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE)); 2706 PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype)); 2707 PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2708 #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) 2709 { 2710 /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it. 
2711 We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp 2712 */ 2713 void *dBuffer1 = NULL; 2714 void *dBuffer2 = NULL; 2715 void *dBuffer3 = NULL; 2716 /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2717 size_t bufferSize1 = 0; 2718 size_t bufferSize2 = 0; 2719 size_t bufferSize3 = 0; 2720 size_t bufferSize4 = 0; 2721 size_t bufferSize5 = 0; 2722 2723 /* ask bufferSize1 bytes for external memory */ 2724 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL)); 2725 PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1)); 2726 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2727 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1)); 2728 2729 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL)); 2730 PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2)); 2731 PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3)); 2732 PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 2733 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4)); 2734 PetscCallHIP(hipFree(dBuffer1)); 2735 PetscCallHIP(hipFree(dBuffer2)); 2736 2737 /* get matrix C non-zero entries C_nnz1 */ 2738 PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, 
&C_nnz1)); 2739 c->nz = (PetscInt)C_nnz1; 2740 /* allocate matrix C */ 2741 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2742 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */ 2743 Ccsr->values = new THRUSTARRAY(c->nz); 2744 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */ 2745 /* update matC with the new pointers */ 2746 if (c->nz) { /* 5.5.1 has a bug with nz = 0, exposed by mat_tests_ex123_2_hypre */ 2747 PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get())); 2748 2749 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL)); 2750 PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 2751 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5)); 2752 PetscCallHIP(hipFree(dBuffer3)); 2753 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc)); 2754 } 2755 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 2756 } 2757 #else 2758 size_t bufSize2; 2759 /* ask bufferSize bytes for external memory */ 2760 PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL)); 
2761 PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 2762 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2763 PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2)); 2764 /* ask bufferSize again bytes for external memory */ 2765 PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL)); 2766 /* Similar to CUSPARSE, we need both buffers to perform the operations properly! 2767 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2768 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2769 is stored in the descriptor! What a messy API... 
*/ 2770 PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 2771 /* compute the intermediate product of A * B */ 2772 PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer)); 2773 /* get matrix C non-zero entries C_nnz1 */ 2774 PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2775 c->nz = (PetscInt)C_nnz1; 2776 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 2777 mmdata->mmBufferSize / 1024)); 2778 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2779 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */ 2780 Ccsr->values = new THRUSTARRAY(c->nz); 2781 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */ 2782 PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get())); 2783 PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc)); 2784 #endif 2785 #else 2786 PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST)); 2787 PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 2788 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, 
Ccsr->row_offsets->data().get(), &cnz)); 2789 c->nz = cnz; 2790 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2791 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */ 2792 Ccsr->values = new THRUSTARRAY(c->nz); 2793 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */ 2794 2795 PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE)); 2796 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2797 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2798 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2799 PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, 2800 Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), 2801 Ccsr->column_indices->data().get())); 2802 #endif 2803 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2804 PetscCall(PetscLogGpuTimeEnd()); 2805 finalizesym: 2806 c->free_a = PETSC_TRUE; 2807 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 2808 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 2809 c->free_ij = PETSC_TRUE; 2810 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 2811 PetscInt *d_i = c->i; 2812 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2813 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2814 ii = *Ccsr->row_offsets; 2815 jj = *Ccsr->column_indices; 2816 if (ciscompressed) d_i = c->compressedrow.i; 2817 PetscCallHIP(hipMemcpy(d_i, 
ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost)); 2818 PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost)); 2819 } else { 2820 PetscInt *d_i = c->i; 2821 if (ciscompressed) d_i = c->compressedrow.i; 2822 PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost)); 2823 PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost)); 2824 } 2825 if (ciscompressed) { /* need to expand host row offsets */ 2826 PetscInt r = 0; 2827 c->i[0] = 0; 2828 for (k = 0; k < c->compressedrow.nrows; k++) { 2829 const PetscInt next = c->compressedrow.rindex[k]; 2830 const PetscInt old = c->compressedrow.i[k]; 2831 for (; r < next; r++) c->i[r + 1] = old; 2832 } 2833 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 2834 } 2835 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 2836 PetscCall(PetscMalloc1(m, &c->ilen)); 2837 PetscCall(PetscMalloc1(m, &c->imax)); 2838 c->maxnz = c->nz; 2839 c->nonzerorowcnt = 0; 2840 c->rmax = 0; 2841 for (k = 0; k < m; k++) { 2842 const PetscInt nn = c->i[k + 1] - c->i[k]; 2843 c->ilen[k] = c->imax[k] = nn; 2844 c->nonzerorowcnt += (PetscInt)!!nn; 2845 c->rmax = PetscMax(c->rmax, nn); 2846 } 2847 PetscCall(PetscMalloc1(c->nz, &c->a)); 2848 Ccsr->num_entries = c->nz; 2849 2850 C->nonzerostate++; 2851 PetscCall(PetscLayoutSetUp(C->rmap)); 2852 PetscCall(PetscLayoutSetUp(C->cmap)); 2853 Ccusp->nonzerostate = C->nonzerostate; 2854 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2855 C->preallocated = PETSC_TRUE; 2856 C->assembled = PETSC_FALSE; 2857 C->was_assembled = PETSC_FALSE; 2858 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, 
so that the numeric phase will only call MatAssembly */ 2859 mmdata->reusesym = PETSC_TRUE; 2860 C->offloadmask = PETSC_OFFLOAD_GPU; 2861 } 2862 C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE; 2863 PetscFunctionReturn(PETSC_SUCCESS); 2864 } 2865 2866 /* handles sparse or dense B */ 2867 static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat) 2868 { 2869 Mat_Product *product = mat->product; 2870 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 2871 2872 PetscFunctionBegin; 2873 MatCheckProduct(mat, 1); 2874 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 2875 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp)); 2876 if (product->type == MATPRODUCT_ABC) { 2877 Ciscusp = PETSC_FALSE; 2878 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp)); 2879 } 2880 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 2881 PetscBool usecpu = PETSC_FALSE; 2882 switch (product->type) { 2883 case MATPRODUCT_AB: 2884 if (product->api_user) { 2885 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 2886 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 2887 PetscOptionsEnd(); 2888 } else { 2889 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 2890 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 2891 PetscOptionsEnd(); 2892 } 2893 break; 2894 case MATPRODUCT_AtB: 2895 if (product->api_user) { 2896 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 2897 
PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 2898 PetscOptionsEnd(); 2899 } else { 2900 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 2901 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 2902 PetscOptionsEnd(); 2903 } 2904 break; 2905 case MATPRODUCT_PtAP: 2906 if (product->api_user) { 2907 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 2908 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 2909 PetscOptionsEnd(); 2910 } else { 2911 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 2912 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 2913 PetscOptionsEnd(); 2914 } 2915 break; 2916 case MATPRODUCT_RARt: 2917 if (product->api_user) { 2918 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 2919 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 2920 PetscOptionsEnd(); 2921 } else { 2922 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 2923 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 2924 PetscOptionsEnd(); 2925 } 2926 break; 2927 case MATPRODUCT_ABC: 2928 if (product->api_user) { 2929 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat"); 2930 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 2931 PetscOptionsEnd(); 2932 } else { 2933 
PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 2934 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 2935 PetscOptionsEnd(); 2936 } 2937 break; 2938 default: 2939 break; 2940 } 2941 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 2942 } 2943 /* dispatch */ 2944 if (isdense) { 2945 switch (product->type) { 2946 case MATPRODUCT_AB: 2947 case MATPRODUCT_AtB: 2948 case MATPRODUCT_ABt: 2949 case MATPRODUCT_PtAP: 2950 case MATPRODUCT_RARt: 2951 if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 2952 else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP; 2953 break; 2954 case MATPRODUCT_ABC: 2955 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2956 break; 2957 default: 2958 break; 2959 } 2960 } else if (Biscusp && Ciscusp) { 2961 switch (product->type) { 2962 case MATPRODUCT_AB: 2963 case MATPRODUCT_AtB: 2964 case MATPRODUCT_ABt: 2965 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE; 2966 break; 2967 case MATPRODUCT_PtAP: 2968 case MATPRODUCT_RARt: 2969 case MATPRODUCT_ABC: 2970 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2971 break; 2972 default: 2973 break; 2974 } 2975 } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */ 2976 PetscFunctionReturn(PETSC_SUCCESS); 2977 } 2978 2979 static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy) 2980 { 2981 PetscFunctionBegin; 2982 PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 2983 PetscFunctionReturn(PETSC_SUCCESS); 2984 } 2985 2986 static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 2987 { 2988 PetscFunctionBegin; 2989 PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 2990 PetscFunctionReturn(PETSC_SUCCESS); 2991 } 2992 2993 static PetscErrorCode 
MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy) 2994 { 2995 PetscFunctionBegin; 2996 PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 2997 PetscFunctionReturn(PETSC_SUCCESS); 2998 } 2999 3000 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3001 { 3002 PetscFunctionBegin; 3003 PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3004 PetscFunctionReturn(PETSC_SUCCESS); 3005 } 3006 3007 static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy) 3008 { 3009 PetscFunctionBegin; 3010 PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3011 PetscFunctionReturn(PETSC_SUCCESS); 3012 } 3013 3014 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3015 { 3016 int i = blockIdx.x * blockDim.x + threadIdx.x; 3017 if (i < n) y[idx[i]] += x[i]; 3018 } 3019 3020 /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3021 static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3022 { 3023 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3024 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr; 3025 Mat_SeqAIJHIPSPARSEMultStruct *matstruct; 3026 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3027 hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE; 3028 PetscBool compressed; 3029 PetscInt nx, ny; 3030 3031 PetscFunctionBegin; 3032 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3033 if (!a->nz) { 3034 if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz)); 3035 else PetscCall(VecSeq_HIP::Set(zz, 0)); 3036 PetscFunctionReturn(PETSC_SUCCESS); 3037 } 3038 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3039 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 3040 if (!trans) { 3041 matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat; 3042 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)"); 3043 } else { 3044 if (herm || !A->form_explicit_transpose) { 3045 opA = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE; 3046 matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat; 3047 } else { 3048 if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A)); 3049 matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose; 3050 } 3051 } 3052 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3053 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3054 try { 3055 PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray)); 3056 if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3057 else PetscCall(VecHIPGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3058 3059 PetscCall(PetscLogGpuTimeBegin()); 3060 if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) { 3061 /* z = A x + beta y. 3062 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3063 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3064 */ 3065 xptr = xarray; 3066 dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray; 3067 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3068 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3069 allocated to accommodate different uses. So we get the length info directly from mat. 3070 */ 3071 if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { 3072 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3073 nx = mat->num_cols; 3074 ny = mat->num_rows; 3075 } 3076 } else { 3077 /* z = A^T x + beta y 3078 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3079 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3080 */ 3081 xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray; 3082 dptr = zarray; 3083 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3084 if (compressed) { /* Scatter x to work vector */ 3085 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3086 thrust::for_each( 3087 #if PetscDefined(HAVE_THRUST_ASYNC) 3088 thrust::hip::par.on(PetscDefaultHipStream), 3089 #endif 3090 thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3091 thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse()); 3092 } 3093 if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { 3094 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3095 nx = mat->num_rows; 3096 ny = mat->num_cols; 3097 } 3098 } 3099 /* csr_spmv does y = alpha op(A) x + beta y */ 3100 if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { 3101 #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) 3102 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly"); 3103 if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */ 3104 PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype)); 3105 PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype)); 3106 PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, 3107 &matstruct->hipSpMV[opA].spmvBufferSize)); 3108 PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize)); 3109 matstruct->hipSpMV[opA].initialized = PETSC_TRUE; 3110 } else { 3111 /* x, y's value pointers might 
change between calls, but their shape is kept, so we just update pointers */ 3112 PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr)); 3113 PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr)); 3114 } 3115 PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */ 3116 matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer)); 3117 #else 3118 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3119 PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3120 #endif 3121 } else { 3122 if (hipsparsestruct->nrows) { 3123 hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat; 3124 PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3125 } 3126 } 3127 PetscCall(PetscLogGpuTimeEnd()); 3128 3129 if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) { 3130 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3131 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3132 PetscCall(VecSeq_HIP::Copy(yy, zz)); /* zz = yy */ 3133 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3134 PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */ 3135 } 3136 } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3137 PetscCall(VecSeq_HIP::Set(zz, 0)); 3138 } 3139 3140 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3141 if (compressed) { 3142 PetscCall(PetscLogGpuTimeBegin()); 3143 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3144 and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to 3145 prevent that. So I just add a ScatterAdd kernel. 3146 */ 3147 #if 0 3148 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3149 thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream), 3150 thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3151 thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3152 VecHIPPlusEquals()); 3153 #else 3154 PetscInt n = matstruct->cprowIndices->size(); 3155 hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray); 3156 #endif 3157 PetscCall(PetscLogGpuTimeEnd()); 3158 } 3159 } else { 3160 if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */ 3161 } 3162 PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3163 if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray)); 3164 else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray)); 3165 } catch (char *ex) { 3166 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex); 3167 } 3168 if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3169 else PetscCall(PetscLogGpuFlops(2.0 * a->nz - 
a->nonzerorowcnt)); 3170 PetscFunctionReturn(PETSC_SUCCESS); 3171 } 3172 3173 static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3174 { 3175 PetscFunctionBegin; 3176 PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE)); 3177 PetscFunctionReturn(PETSC_SUCCESS); 3178 } 3179 3180 static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode) 3181 { 3182 PetscFunctionBegin; 3183 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 3184 PetscFunctionReturn(PETSC_SUCCESS); 3185 } 3186 3187 /*@ 3188 MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format. 3189 This matrix will ultimately pushed down to AMD GPUs and use the HIPSPARSE library for calculations. 3190 3191 Collective 3192 3193 Input Parameters: 3194 + comm - MPI communicator, set to `PETSC_COMM_SELF` 3195 . m - number of rows 3196 . n - number of columns 3197 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is set 3198 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 3199 3200 Output Parameter: 3201 . A - the matrix 3202 3203 Level: intermediate 3204 3205 Notes: 3206 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 3207 `MatXXXXSetPreallocation()` paradgm instead of this routine directly. 3208 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`] 3209 3210 The AIJ format (compressed row storage), is fully compatible with standard Fortran 3211 storage. That is, the stored row and column indices can begin at 3212 either one (as in Fortran) or zero. 3213 3214 Specify the preallocated storage with either `nz` or `nnz` (not both). 3215 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 3216 allocation. 
3217 3218 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE` 3219 @*/ 3220 PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 3221 { 3222 PetscFunctionBegin; 3223 PetscCall(MatCreate(comm, A)); 3224 PetscCall(MatSetSizes(*A, m, n, m, n)); 3225 PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE)); 3226 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 3227 PetscFunctionReturn(PETSC_SUCCESS); 3228 } 3229 3230 static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A) 3231 { 3232 PetscFunctionBegin; 3233 if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJHIPSPARSE_Destroy(A)); 3234 else PetscCall(MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr)); 3235 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3236 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL)); 3237 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL)); 3238 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL)); 3239 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL)); 3240 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL)); 3241 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 3242 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3243 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3244 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL)); 3245 PetscCall(MatDestroy_SeqAIJ(A)); 3246 
PetscFunctionReturn(PETSC_SUCCESS); 3247 } 3248 3249 static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) 3250 { 3251 PetscFunctionBegin; 3252 PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 3253 PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B)); 3254 PetscFunctionReturn(PETSC_SUCCESS); 3255 } 3256 3257 static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) 3258 { 3259 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3260 Mat_SeqAIJHIPSPARSE *cy; 3261 Mat_SeqAIJHIPSPARSE *cx; 3262 PetscScalar *ay; 3263 const PetscScalar *ax; 3264 CsrMatrix *csry, *csrx; 3265 3266 PetscFunctionBegin; 3267 cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr; 3268 cx = (Mat_SeqAIJHIPSPARSE *)X->spptr; 3269 if (X->ops->axpy != Y->ops->axpy) { 3270 PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3271 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3272 PetscFunctionReturn(PETSC_SUCCESS); 3273 } 3274 /* if we are here, it means both matrices are bound to GPU */ 3275 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y)); 3276 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X)); 3277 PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported"); 3278 PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported"); 3279 csry = (CsrMatrix *)cy->mat->mat; 3280 csrx = (CsrMatrix *)cx->mat->mat; 3281 /* see if we can turn this into a hipblas axpy */ 3282 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3283 bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3284 if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3285 if (eq) str = 
SAME_NONZERO_PATTERN; 3286 } 3287 /* spgeam is buggy with one column */ 3288 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3289 if (str == SUBSET_NONZERO_PATTERN) { 3290 PetscScalar b = 1.0; 3291 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0) 3292 size_t bufferSize; 3293 void *buffer; 3294 #endif 3295 3296 PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax)); 3297 PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay)); 3298 PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST)); 3299 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0) 3300 PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3301 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 3302 PetscCallHIP(hipMalloc(&buffer, bufferSize)); 3303 PetscCall(PetscLogGpuTimeBegin()); 3304 PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3305 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 3306 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3307 PetscCall(PetscLogGpuTimeEnd()); 3308 PetscCallHIP(hipFree(buffer)); 3309 #else 3310 PetscCall(PetscLogGpuTimeBegin()); 3311 PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3312 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), 
csry->column_indices->data().get())); 3313 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3314 PetscCall(PetscLogGpuTimeEnd()); 3315 #endif 3316 PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE)); 3317 PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax)); 3318 PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay)); 3319 } else if (str == SAME_NONZERO_PATTERN) { 3320 hipblasHandle_t hipblasv2handle; 3321 PetscBLASInt one = 1, bnz = 1; 3322 3323 PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax)); 3324 PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay)); 3325 PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle)); 3326 PetscCall(PetscBLASIntCast(x->nz, &bnz)); 3327 PetscCall(PetscLogGpuTimeBegin()); 3328 PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one)); 3329 PetscCall(PetscLogGpuFlops(2.0 * bnz)); 3330 PetscCall(PetscLogGpuTimeEnd()); 3331 PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax)); 3332 PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay)); 3333 } else { 3334 PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3335 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3336 } 3337 PetscFunctionReturn(PETSC_SUCCESS); 3338 } 3339 3340 static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a) 3341 { 3342 Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 3343 PetscScalar *ay; 3344 hipblasHandle_t hipblasv2handle; 3345 PetscBLASInt one = 1, bnz = 1; 3346 3347 PetscFunctionBegin; 3348 PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay)); 3349 PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle)); 3350 PetscCall(PetscBLASIntCast(y->nz, &bnz)); 3351 PetscCall(PetscLogGpuTimeBegin()); 3352 PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, bnz, &a, ay, one)); 3353 PetscCall(PetscLogGpuFlops(bnz)); 3354 PetscCall(PetscLogGpuTimeEnd()); 3355 PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay)); 3356 PetscFunctionReturn(PETSC_SUCCESS); 3357 } 3358 3359 static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A) 3360 { 3361 PetscBool both = 
PETSC_FALSE; 3362 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3363 3364 PetscFunctionBegin; 3365 if (A->factortype == MAT_FACTOR_NONE) { 3366 Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr; 3367 if (spptr->mat) { 3368 CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 3369 if (matrix->values) { 3370 both = PETSC_TRUE; 3371 thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3372 } 3373 } 3374 if (spptr->matTranspose) { 3375 CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3376 if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3377 } 3378 } 3379 //PetscCall(MatZeroEntries_SeqAIJ(A)); 3380 PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 3381 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3382 else A->offloadmask = PETSC_OFFLOAD_CPU; 3383 PetscFunctionReturn(PETSC_SUCCESS); 3384 } 3385 3386 static PetscErrorCode MatGetCurrentMemType_SeqAIJHIPSPARSE(PETSC_UNUSED Mat A, PetscMemType *m) 3387 { 3388 PetscFunctionBegin; 3389 *m = PETSC_MEMTYPE_HIP; 3390 PetscFunctionReturn(PETSC_SUCCESS); 3391 } 3392 3393 static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg) 3394 { 3395 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3396 3397 PetscFunctionBegin; 3398 if (A->factortype != MAT_FACTOR_NONE) { 3399 A->boundtocpu = flg; 3400 PetscFunctionReturn(PETSC_SUCCESS); 3401 } 3402 if (flg) { 3403 PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A)); 3404 3405 A->ops->scale = MatScale_SeqAIJ; 3406 A->ops->axpy = MatAXPY_SeqAIJ; 3407 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3408 A->ops->mult = MatMult_SeqAIJ; 3409 A->ops->multadd = MatMultAdd_SeqAIJ; 3410 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3411 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3412 A->ops->multhermitiantranspose = NULL; 3413 A->ops->multhermitiantransposeadd = NULL; 3414 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3415 A->ops->getcurrentmemtype = NULL; 3416 
PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 3417 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3418 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL)); 3419 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL)); 3420 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3421 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3422 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL)); 3423 } else { 3424 A->ops->scale = MatScale_SeqAIJHIPSPARSE; 3425 A->ops->axpy = MatAXPY_SeqAIJHIPSPARSE; 3426 A->ops->zeroentries = MatZeroEntries_SeqAIJHIPSPARSE; 3427 A->ops->mult = MatMult_SeqAIJHIPSPARSE; 3428 A->ops->multadd = MatMultAdd_SeqAIJHIPSPARSE; 3429 A->ops->multtranspose = MatMultTranspose_SeqAIJHIPSPARSE; 3430 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJHIPSPARSE; 3431 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJHIPSPARSE; 3432 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE; 3433 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJHIPSPARSE; 3434 A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE; 3435 a->ops->getarray = MatSeqAIJGetArray_SeqAIJHIPSPARSE; 3436 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE; 3437 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE; 3438 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE; 3439 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE; 3440 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE; 3441 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE; 3442 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", 
MatSeqAIJCopySubArray_SeqAIJHIPSPARSE)); 3443 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE)); 3444 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE)); 3445 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE)); 3446 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE)); 3447 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE)); 3448 } 3449 A->boundtocpu = flg; 3450 if (flg && a->inode.size_csr) a->inode.use = PETSC_TRUE; 3451 else a->inode.use = PETSC_FALSE; 3452 PetscFunctionReturn(PETSC_SUCCESS); 3453 } 3454 3455 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 3456 { 3457 Mat B; 3458 3459 PetscFunctionBegin; 3460 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */ 3461 if (reuse == MAT_INITIAL_MATRIX) { 3462 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 3463 } else if (reuse == MAT_REUSE_MATRIX) { 3464 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 3465 } 3466 B = *newmat; 3467 PetscCall(PetscFree(B->defaultvectype)); 3468 PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype)); 3469 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3470 if (B->factortype == MAT_FACTOR_NONE) { 3471 Mat_SeqAIJHIPSPARSE *spptr; 3472 PetscCall(PetscNew(&spptr)); 3473 PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle)); 3474 PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream)); 3475 spptr->format = MAT_HIPSPARSE_CSR; 3476 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0) 3477 spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1; 3478 
#else 3479 spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3480 #endif 3481 spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3482 //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1; 3483 3484 B->spptr = spptr; 3485 } else { 3486 Mat_SeqAIJHIPSPARSETriFactors *spptr; 3487 3488 PetscCall(PetscNew(&spptr)); 3489 PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle)); 3490 PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream)); 3491 B->spptr = spptr; 3492 } 3493 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3494 } 3495 B->ops->assemblyend = MatAssemblyEnd_SeqAIJHIPSPARSE; 3496 B->ops->destroy = MatDestroy_SeqAIJHIPSPARSE; 3497 B->ops->setoption = MatSetOption_SeqAIJHIPSPARSE; 3498 B->ops->setfromoptions = MatSetFromOptions_SeqAIJHIPSPARSE; 3499 B->ops->bindtocpu = MatBindToCPU_SeqAIJHIPSPARSE; 3500 B->ops->duplicate = MatDuplicate_SeqAIJHIPSPARSE; 3501 B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE; 3502 3503 PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE)); 3504 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE)); 3505 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE)); 3506 #if defined(PETSC_HAVE_HYPRE) 3507 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE)); 3508 #endif 3509 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE)); 3510 PetscFunctionReturn(PETSC_SUCCESS); 3511 } 3512 3513 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B) 3514 { 3515 PetscFunctionBegin; 3516 PetscCall(MatCreate_SeqAIJ(B)); 3517 PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B)); 3518 PetscFunctionReturn(PETSC_SUCCESS); 3519 } 3520 3521 /*MC 3522 MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = 
"(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs 3523 3524 A matrix type whose data resides on AMD GPUs. These matrices can be in either 3525 CSR, ELL, or Hybrid format. 3526 All matrix calculations are performed on AMD/NVIDIA GPUs using the HIPSPARSE library. 3527 3528 Options Database Keys: 3529 + -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE` 3530 . -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`). 3531 Other options include ell (ellpack) or hyb (hybrid). 3532 . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid). 3533 - -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU 3534 3535 Level: beginner 3536 3537 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation` 3538 M*/ 3539 3540 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void) 3541 { 3542 PetscFunctionBegin; 3543 PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse)); 3544 PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse)); 3545 PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse)); 3546 PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse)); 3547 PetscFunctionReturn(PETSC_SUCCESS); 3548 } 3549 3550 static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat) 3551 { 3552 Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr); 3553 3554 PetscFunctionBegin; 3555 if (cusp) { 3556 
PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format)); 3557 PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 3558 delete cusp->workVector; 3559 delete cusp->rowoffsets_gpu; 3560 delete cusp->csr2csc_i; 3561 delete cusp->coords; 3562 if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle)); 3563 PetscCall(PetscFree(mat->spptr)); 3564 } 3565 PetscFunctionReturn(PETSC_SUCCESS); 3566 } 3567 3568 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3569 { 3570 PetscFunctionBegin; 3571 if (*mat) { 3572 delete (*mat)->values; 3573 delete (*mat)->column_indices; 3574 delete (*mat)->row_offsets; 3575 delete *mat; 3576 *mat = 0; 3577 } 3578 PetscFunctionReturn(PETSC_SUCCESS); 3579 } 3580 3581 static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor) 3582 { 3583 PetscFunctionBegin; 3584 if (*trifactor) { 3585 if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr)); 3586 if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 3587 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 3588 if ((*trifactor)->solveBuffer) PetscCallHIP(hipFree((*trifactor)->solveBuffer)); 3589 if ((*trifactor)->AA_h) PetscCallHIP(hipHostFree((*trifactor)->AA_h)); 3590 if ((*trifactor)->csr2cscBuffer) PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer)); 3591 PetscCall(PetscFree(*trifactor)); 3592 } 3593 PetscFunctionReturn(PETSC_SUCCESS); 3594 } 3595 3596 static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format) 3597 { 3598 CsrMatrix *mat; 3599 3600 PetscFunctionBegin; 3601 if (*matstruct) { 3602 if ((*matstruct)->mat) { 3603 if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) { 3604 hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat; 3605 PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat)); 3606 } else 
{ 3607 mat = (CsrMatrix *)(*matstruct)->mat; 3608 PetscCall(CsrMatrix_Destroy(&mat)); 3609 } 3610 } 3611 if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr)); 3612 delete (*matstruct)->cprowIndices; 3613 if ((*matstruct)->alpha_one) PetscCallHIP(hipFree((*matstruct)->alpha_one)); 3614 if ((*matstruct)->beta_zero) PetscCallHIP(hipFree((*matstruct)->beta_zero)); 3615 if ((*matstruct)->beta_one) PetscCallHIP(hipFree((*matstruct)->beta_one)); 3616 3617 Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct; 3618 if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr)); 3619 for (int i = 0; i < 3; i++) { 3620 if (mdata->hipSpMV[i].initialized) { 3621 PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer)); 3622 PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr)); 3623 PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr)); 3624 } 3625 } 3626 delete *matstruct; 3627 *matstruct = NULL; 3628 } 3629 PetscFunctionReturn(PETSC_SUCCESS); 3630 } 3631 3632 PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors) 3633 { 3634 Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors; 3635 3636 PetscFunctionBegin; 3637 if (fs) { 3638 PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 3639 PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 3640 PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 3641 PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 3642 delete fs->rpermIndices; 3643 delete fs->cpermIndices; 3644 delete fs->workVector; 3645 fs->rpermIndices = NULL; 3646 fs->cpermIndices = NULL; 3647 fs->workVector = NULL; 3648 fs->init_dev_prop = PETSC_FALSE; 3649 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0) 3650 PetscCallHIP(hipFree(fs->csrRowPtr)); 3651 PetscCallHIP(hipFree(fs->csrColIdx)); 3652 PetscCallHIP(hipFree(fs->csrVal)); 3653 PetscCallHIP(hipFree(fs->X)); 3654 
PetscCallHIP(hipFree(fs->Y)); 3655 // PetscCallHIP(hipFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 3656 PetscCallHIP(hipFree(fs->spsvBuffer_L)); 3657 PetscCallHIP(hipFree(fs->spsvBuffer_U)); 3658 PetscCallHIP(hipFree(fs->spsvBuffer_Lt)); 3659 PetscCallHIP(hipFree(fs->spsvBuffer_Ut)); 3660 PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M)); 3661 if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L)); 3662 if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U)); 3663 PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L)); 3664 PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 3665 PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U)); 3666 PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 3667 if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X)); 3668 if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y)); 3669 PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M)); 3670 PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M)); 3671 3672 fs->createdTransposeSpSVDescr = PETSC_FALSE; 3673 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 3674 #endif 3675 } 3676 PetscFunctionReturn(PETSC_SUCCESS); 3677 } 3678 3679 static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors) 3680 { 3681 hipsparseHandle_t handle; 3682 3683 PetscFunctionBegin; 3684 if (*trifactors) { 3685 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors)); 3686 if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle)); 3687 PetscCall(PetscFree(*trifactors)); 3688 } 3689 PetscFunctionReturn(PETSC_SUCCESS); 3690 } 3691 3692 struct IJCompare { 3693 __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3694 { 3695 if (t1.get<0>() < 
t2.get<0>()) return true; 3696 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3697 return false; 3698 } 3699 }; 3700 3701 static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3702 { 3703 Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; 3704 3705 PetscFunctionBegin; 3706 PetscCheckTypeName(A, MATSEQAIJHIPSPARSE); 3707 if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 3708 if (destroy) { 3709 PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 3710 delete cusp->csr2csc_i; 3711 cusp->csr2csc_i = NULL; 3712 } 3713 A->transupdated = PETSC_FALSE; 3714 PetscFunctionReturn(PETSC_SUCCESS); 3715 } 3716 3717 static PetscErrorCode MatCOOStructDestroy_SeqAIJHIPSPARSE(PetscCtxRt data) 3718 { 3719 MatCOOStruct_SeqAIJ *coo = *(MatCOOStruct_SeqAIJ **)data; 3720 3721 PetscFunctionBegin; 3722 PetscCallHIP(hipFree(coo->perm)); 3723 PetscCallHIP(hipFree(coo->jmap)); 3724 PetscCall(PetscFree(coo)); 3725 PetscFunctionReturn(PETSC_SUCCESS); 3726 } 3727 3728 static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 3729 { 3730 PetscBool dev_ij = PETSC_FALSE; 3731 PetscMemType mtype = PETSC_MEMTYPE_HOST; 3732 PetscInt *i, *j; 3733 PetscContainer container_h; 3734 MatCOOStruct_SeqAIJ *coo_h, *coo_d; 3735 3736 PetscFunctionBegin; 3737 PetscCall(PetscGetMemType(coo_i, &mtype)); 3738 if (PetscMemTypeDevice(mtype)) { 3739 dev_ij = PETSC_TRUE; 3740 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 3741 PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost)); 3742 PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost)); 3743 } else { 3744 i = coo_i; 3745 j = coo_j; 3746 } 3747 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); 3748 if (dev_ij) PetscCall(PetscFree2(i, j)); 3749 mat->offloadmask = PETSC_OFFLOAD_CPU; 3750 // Create the GPU memory 3751 
PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mat)); 3752 3753 // Copy the COO struct to device 3754 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 3755 PetscCall(PetscContainerGetPointer(container_h, &coo_h)); 3756 PetscCall(PetscMalloc1(1, &coo_d)); 3757 *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different 3758 PetscCallHIP(hipMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount))); 3759 PetscCallHIP(hipMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice)); 3760 PetscCallHIP(hipMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount))); 3761 PetscCallHIP(hipMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), hipMemcpyHostToDevice)); 3762 3763 // Put the COO struct in a container and then attach that to the matrix 3764 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJHIPSPARSE)); 3765 PetscFunctionReturn(PETSC_SUCCESS); 3766 } 3767 3768 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) 3769 { 3770 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 3771 const PetscCount grid_size = gridDim.x * blockDim.x; 3772 for (; i < nnz; i += grid_size) { 3773 PetscScalar sum = 0.0; 3774 for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 3775 a[i] = (imode == INSERT_VALUES ? 
0.0 : a[i]) + sum; 3776 } 3777 } 3778 3779 static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3780 { 3781 Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 3782 Mat_SeqAIJHIPSPARSE *dev = (Mat_SeqAIJHIPSPARSE *)A->spptr; 3783 PetscCount Annz = seq->nz; 3784 PetscMemType memtype; 3785 const PetscScalar *v1 = v; 3786 PetscScalar *Aa; 3787 PetscContainer container; 3788 MatCOOStruct_SeqAIJ *coo; 3789 3790 PetscFunctionBegin; 3791 if (!dev->mat) PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 3792 3793 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 3794 PetscCall(PetscContainerGetPointer(container, &coo)); 3795 3796 PetscCall(PetscGetMemType(v, &memtype)); 3797 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 3798 PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar))); 3799 PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice)); 3800 } 3801 3802 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa)); 3803 else PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa)); 3804 3805 PetscCall(PetscLogGpuTimeBegin()); 3806 if (Annz) { 3807 hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, coo->jmap, coo->perm, imode, Aa); 3808 PetscCallHIP(hipPeekAtLastError()); 3809 } 3810 PetscCall(PetscLogGpuTimeEnd()); 3811 3812 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa)); 3813 else PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa)); 3814 3815 if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1)); 3816 PetscFunctionReturn(PETSC_SUCCESS); 3817 } 3818 3819 /*@C 3820 MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices. 
3821 3822 Not Collective 3823 3824 Input Parameters: 3825 + A - the matrix 3826 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 3827 3828 Output Parameters: 3829 + i - the CSR row pointers 3830 - j - the CSR column indices 3831 3832 Level: developer 3833 3834 Note: 3835 When compressed is true, the CSR structure does not contain empty rows 3836 3837 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()` 3838 @*/ 3839 PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int *i[], const int *j[]) 3840 { 3841 Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; 3842 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3843 CsrMatrix *csr; 3844 3845 PetscFunctionBegin; 3846 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 3847 if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); 3848 PetscCheckTypeName(A, MATSEQAIJHIPSPARSE); 3849 PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 3850 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 3851 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct"); 3852 csr = (CsrMatrix *)cusp->mat->mat; 3853 if (i) { 3854 if (!compressed && a->compressedrow.use) { /* need full row offset */ 3855 if (!cusp->rowoffsets_gpu) { 3856 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3857 cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 3858 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 3859 } 3860 *i = cusp->rowoffsets_gpu->data().get(); 3861 } else *i = csr->row_offsets->data().get(); 3862 } 3863 if (j) *j = csr->column_indices->data().get(); 3864 PetscFunctionReturn(PETSC_SUCCESS); 3865 } 3866 3867 /*@C 3868 MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()` 3869 3870 Not Collective 
3871 3872 Input Parameters: 3873 + A - the matrix 3874 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 3875 . i - the CSR row pointers 3876 - j - the CSR column indices 3877 3878 Level: developer 3879 3880 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()` 3881 @*/ 3882 PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int *i[], const int *j[]) 3883 { 3884 PetscFunctionBegin; 3885 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 3886 PetscCheckTypeName(A, MATSEQAIJHIPSPARSE); 3887 if (i) *i = NULL; 3888 if (j) *j = NULL; 3889 PetscFunctionReturn(PETSC_SUCCESS); 3890 } 3891 3892 /*@C 3893 MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored 3894 3895 Not Collective 3896 3897 Input Parameter: 3898 . A - a `MATSEQAIJHIPSPARSE` matrix 3899 3900 Output Parameter: 3901 . a - pointer to the device data 3902 3903 Level: developer 3904 3905 Note: 3906 May trigger host-device copies if the up-to-date matrix data is on host 3907 3908 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()` 3909 @*/ 3910 PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar *a[]) 3911 { 3912 Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; 3913 CsrMatrix *csr; 3914 3915 PetscFunctionBegin; 3916 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 3917 PetscAssertPointer(a, 2); 3918 PetscCheckTypeName(A, MATSEQAIJHIPSPARSE); 3919 PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 3920 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 3921 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct"); 3922 csr = (CsrMatrix *)cusp->mat->mat; 3923 PetscCheck(csr->values, PETSC_COMM_SELF, 
PETSC_ERR_COR, "Missing HIP memory"); 3924 *a = csr->values->data().get(); 3925 PetscFunctionReturn(PETSC_SUCCESS); 3926 } 3927 3928 /*@C 3929 MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()` 3930 3931 Not Collective 3932 3933 Input Parameters: 3934 + A - a `MATSEQAIJHIPSPARSE` matrix 3935 - a - pointer to the device data 3936 3937 Level: developer 3938 3939 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()` 3940 @*/ 3941 PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar *a[]) 3942 { 3943 PetscFunctionBegin; 3944 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 3945 PetscAssertPointer(a, 2); 3946 PetscCheckTypeName(A, MATSEQAIJHIPSPARSE); 3947 *a = NULL; 3948 PetscFunctionReturn(PETSC_SUCCESS); 3949 } 3950 3951 /*@C 3952 MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored 3953 3954 Not Collective 3955 3956 Input Parameter: 3957 . A - a `MATSEQAIJHIPSPARSE` matrix 3958 3959 Output Parameter: 3960 . 
a - pointer to the device data 3961 3962 Level: developer 3963 3964 Note: 3965 May trigger host-device copies if up-to-date matrix data is on host 3966 3967 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()` 3968 @*/ 3969 PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar *a[]) 3970 { 3971 Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; 3972 CsrMatrix *csr; 3973 3974 PetscFunctionBegin; 3975 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 3976 PetscAssertPointer(a, 2); 3977 PetscCheckTypeName(A, MATSEQAIJHIPSPARSE); 3978 PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 3979 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 3980 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct"); 3981 csr = (CsrMatrix *)cusp->mat->mat; 3982 PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory"); 3983 *a = csr->values->data().get(); 3984 A->offloadmask = PETSC_OFFLOAD_GPU; 3985 PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE)); 3986 PetscFunctionReturn(PETSC_SUCCESS); 3987 } 3988 /*@C 3989 MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()` 3990 3991 Not Collective 3992 3993 Input Parameters: 3994 + A - a `MATSEQAIJHIPSPARSE` matrix 3995 - a - pointer to the device data 3996 3997 Level: developer 3998 3999 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()` 4000 @*/ 4001 PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar *a[]) 4002 { 4003 PetscFunctionBegin; 4004 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4005 PetscAssertPointer(a, 2); 4006 PetscCheckTypeName(A, MATSEQAIJHIPSPARSE); 4007 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4008 *a = NULL; 4009 PetscFunctionReturn(PETSC_SUCCESS); 4010 } 4011 4012 /*@C 4013 
MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored 4014 4015 Not Collective 4016 4017 Input Parameter: 4018 . A - a `MATSEQAIJHIPSPARSE` matrix 4019 4020 Output Parameter: 4021 . a - pointer to the device data 4022 4023 Level: developer 4024 4025 Note: 4026 Does not trigger host-device copies and flags data validity on the GPU 4027 4028 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()` 4029 @*/ 4030 PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar *a[]) 4031 { 4032 Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; 4033 CsrMatrix *csr; 4034 4035 PetscFunctionBegin; 4036 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4037 PetscAssertPointer(a, 2); 4038 PetscCheckTypeName(A, MATSEQAIJHIPSPARSE); 4039 PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4040 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct"); 4041 csr = (CsrMatrix *)cusp->mat->mat; 4042 PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory"); 4043 *a = csr->values->data().get(); 4044 A->offloadmask = PETSC_OFFLOAD_GPU; 4045 PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4046 PetscFunctionReturn(PETSC_SUCCESS); 4047 } 4048 4049 /*@C 4050 MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()` 4051 4052 Not Collective 4053 4054 Input Parameters: 4055 + A - a `MATSEQAIJHIPSPARSE` matrix 4056 - a - pointer to the device data 4057 4058 Level: developer 4059 4060 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()` 4061 @*/ 4062 PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar *a[]) 4063 { 4064 PetscFunctionBegin; 4065 
PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4066 PetscAssertPointer(a, 2); 4067 PetscCheckTypeName(A, MATSEQAIJHIPSPARSE); 4068 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4069 *a = NULL; 4070 PetscFunctionReturn(PETSC_SUCCESS); 4071 } 4072 4073 struct IJCompare4 { 4074 __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4075 { 4076 if (t1.get<0>() < t2.get<0>()) return true; 4077 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4078 return false; 4079 } 4080 }; 4081 4082 struct Shift { 4083 int _shift; 4084 4085 Shift(int shift) : _shift(shift) { } 4086 __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4087 }; 4088 4089 /* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */ 4090 PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4091 { 4092 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4093 Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp; 4094 Mat_SeqAIJHIPSPARSEMultStruct *Cmat; 4095 CsrMatrix *Acsr, *Bcsr, *Ccsr; 4096 PetscInt Annz, Bnnz; 4097 PetscInt i, m, n, zero = 0; 4098 4099 PetscFunctionBegin; 4100 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4101 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4102 PetscAssertPointer(C, 4); 4103 PetscCheckTypeName(A, MATSEQAIJHIPSPARSE); 4104 PetscCheckTypeName(B, MATSEQAIJHIPSPARSE); 4105 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 4106 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4107 PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 
4108 PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4109 if (reuse == MAT_INITIAL_MATRIX) { 4110 m = A->rmap->n; 4111 n = A->cmap->n + B->cmap->n; 4112 PetscCall(MatCreate(PETSC_COMM_SELF, C)); 4113 PetscCall(MatSetSizes(*C, m, n, m, n)); 4114 PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE)); 4115 c = (Mat_SeqAIJ *)(*C)->data; 4116 Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr; 4117 Cmat = new Mat_SeqAIJHIPSPARSEMultStruct; 4118 Ccsr = new CsrMatrix; 4119 Cmat->cprowIndices = NULL; 4120 c->compressedrow.use = PETSC_FALSE; 4121 c->compressedrow.nrows = 0; 4122 c->compressedrow.i = NULL; 4123 c->compressedrow.rindex = NULL; 4124 Ccusp->workVector = NULL; 4125 Ccusp->nrows = m; 4126 Ccusp->mat = Cmat; 4127 Ccusp->mat->mat = Ccsr; 4128 Ccsr->num_rows = m; 4129 Ccsr->num_cols = n; 4130 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr)); 4131 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO)); 4132 PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); 4133 PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 4134 PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 4135 PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 4136 PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice)); 4137 PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice)); 4138 PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice)); 4139 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 4140 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B)); 4141 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct"); 4142 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct"); 4143 4144 
Acsr = (CsrMatrix *)Acusp->mat->mat; 4145 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4146 Annz = (PetscInt)Acsr->column_indices->size(); 4147 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4148 c->nz = Annz + Bnnz; 4149 Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4150 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4151 Ccsr->values = new THRUSTARRAY(c->nz); 4152 Ccsr->num_entries = c->nz; 4153 Ccusp->coords = new THRUSTINTARRAY(c->nz); 4154 if (c->nz) { 4155 auto Acoo = new THRUSTINTARRAY32(Annz); 4156 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4157 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4158 THRUSTINTARRAY32 *Aroff, *Broff; 4159 4160 if (a->compressedrow.use) { /* need full row offset */ 4161 if (!Acusp->rowoffsets_gpu) { 4162 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4163 Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4164 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4165 } 4166 Aroff = Acusp->rowoffsets_gpu; 4167 } else Aroff = Acsr->row_offsets; 4168 if (b->compressedrow.use) { /* need full row offset */ 4169 if (!Bcusp->rowoffsets_gpu) { 4170 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4171 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 4172 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4173 } 4174 Broff = Bcusp->rowoffsets_gpu; 4175 } else Broff = Bcsr->row_offsets; 4176 PetscCall(PetscLogGpuTimeBegin()); 4177 PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO)); 4178 PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO)); 4179 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4180 auto Aperm = thrust::make_constant_iterator(1); 4181 auto Bperm = thrust::make_constant_iterator(0); 4182 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4183 
auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 4184 auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 4185 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 4186 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 4187 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 4188 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 4189 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4190 auto p1 = Ccusp->coords->begin(); 4191 auto p2 = Ccusp->coords->begin(); 4192 thrust::advance(p2, Annz); 4193 PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 4194 auto cci = thrust::make_counting_iterator(zero); 4195 auto cce = thrust::make_counting_iterator(c->nz); 4196 #if 0 //Errors on SUMMIT cuda 11.1.0 4197 PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>())); 4198 #else 4199 auto pred = thrust::identity<int>(); 4200 PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4201 PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 4202 #endif 4203 PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO)); 4204 PetscCall(PetscLogGpuTimeEnd()); 4205 delete wPerm; 4206 delete Acoo; 4207 delete Bcoo; 4208 delete Ccoo; 4209 PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 
HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype)); 4210 4211 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4212 PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A)); 4213 PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B)); 4214 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4215 Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct; 4216 CsrMatrix *CcsrT = new CsrMatrix; 4217 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4218 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4219 4220 (*C)->form_explicit_transpose = PETSC_TRUE; 4221 (*C)->transupdated = PETSC_TRUE; 4222 Ccusp->rowoffsets_gpu = NULL; 4223 CmatT->cprowIndices = NULL; 4224 CmatT->mat = CcsrT; 4225 CcsrT->num_rows = n; 4226 CcsrT->num_cols = m; 4227 CcsrT->num_entries = c->nz; 4228 CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4229 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4230 CcsrT->values = new THRUSTARRAY(c->nz); 4231 4232 PetscCall(PetscLogGpuTimeBegin()); 4233 auto rT = CcsrT->row_offsets->begin(); 4234 if (AT) { 4235 rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4236 thrust::advance(rT, -1); 4237 } 4238 if (BT) { 4239 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4240 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4241 thrust::copy(titb, tite, rT); 4242 } 4243 auto cT = CcsrT->column_indices->begin(); 4244 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4245 if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4246 auto vT = CcsrT->values->begin(); 4247 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4248 if 
(BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4249 PetscCall(PetscLogGpuTimeEnd()); 4250 4251 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr)); 4252 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO)); 4253 PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); 4254 PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar))); 4255 PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar))); 4256 PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar))); 4257 PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice)); 4258 PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice)); 4259 PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice)); 4260 4261 PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype)); 4262 Ccusp->matTranspose = CmatT; 4263 } 4264 } 4265 4266 c->free_a = PETSC_TRUE; 4267 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 4268 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 4269 c->free_ij = PETSC_TRUE; 4270 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 4271 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4272 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4273 ii = *Ccsr->row_offsets; 4274 jj = *Ccsr->column_indices; 4275 PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost)); 4276 PetscCallHIP(hipMemcpy(c->j, jj.data().get(), 
Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost)); 4277 } else { 4278 PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost)); 4279 PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost)); 4280 } 4281 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 4282 PetscCall(PetscMalloc1(m, &c->ilen)); 4283 PetscCall(PetscMalloc1(m, &c->imax)); 4284 c->maxnz = c->nz; 4285 c->nonzerorowcnt = 0; 4286 c->rmax = 0; 4287 for (i = 0; i < m; i++) { 4288 const PetscInt nn = c->i[i + 1] - c->i[i]; 4289 c->ilen[i] = c->imax[i] = nn; 4290 c->nonzerorowcnt += (PetscInt)!!nn; 4291 c->rmax = PetscMax(c->rmax, nn); 4292 } 4293 PetscCall(PetscMalloc1(c->nz, &c->a)); 4294 (*C)->nonzerostate++; 4295 PetscCall(PetscLayoutSetUp((*C)->rmap)); 4296 PetscCall(PetscLayoutSetUp((*C)->cmap)); 4297 Ccusp->nonzerostate = (*C)->nonzerostate; 4298 (*C)->preallocated = PETSC_TRUE; 4299 } else { 4300 PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4301 c = (Mat_SeqAIJ *)(*C)->data; 4302 if (c->nz) { 4303 Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr; 4304 PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords"); 4305 PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4306 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 4307 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); 4308 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B)); 4309 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct"); 4310 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing 
Mat_SeqAIJHIPSPARSEMultStruct"); 4311 Acsr = (CsrMatrix *)Acusp->mat->mat; 4312 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4313 Ccsr = (CsrMatrix *)Ccusp->mat->mat; 4314 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4315 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4316 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4317 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 4318 PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size()); 4319 auto pmid = Ccusp->coords->begin(); 4320 thrust::advance(pmid, Acsr->num_entries); 4321 PetscCall(PetscLogGpuTimeBegin()); 4322 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin()))); 4323 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4324 thrust::for_each(zibait, zieait, VecHIPEquals()); 4325 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4326 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), 
Ccusp->coords->end()))); 4327 thrust::for_each(zibbit, ziebit, VecHIPEquals()); 4328 PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 4329 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4330 PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct"); 4331 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4332 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4333 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4334 CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat; 4335 auto vT = CcsrT->values->begin(); 4336 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4337 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4338 (*C)->transupdated = PETSC_TRUE; 4339 } 4340 PetscCall(PetscLogGpuTimeEnd()); 4341 } 4342 } 4343 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4344 (*C)->assembled = PETSC_TRUE; 4345 (*C)->was_assembled = PETSC_FALSE; 4346 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4347 PetscFunctionReturn(PETSC_SUCCESS); 4348 }

/*
  MatSeqAIJCopySubArray_SeqAIJHIPSPARSE - Copies n entries of the value array returned by
  MatSeqAIJHIPSPARSEGetArrayRead() into v.

  Input Parameters:
+ A   - the matrix whose value array is read
. n   - number of entries to copy
- idx - positions (into the value array) of the entries to copy; when NULL, or when n is 0,
        the first n entries are copied with a single hipMemcpy()

  Output Parameter:
. v - receives the n values; isHipMem(v) selects between the device and the host code path

  Notes:
  When v is not device memory the gathered values are staged in a temporary device array and
  then copied to v, and the transfer is logged with PetscLogGpuToCpu().
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isHipMem(v);
  PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* Upload the gather indices so the permutation iterator below can read them */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    /* Staging buffer, only populated when v is not device memory. It is owned by this scope so it is
       released on every exit path, including the early returns taken by PetscCallHIP() on failure */
    THRUSTARRAY                     w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) dv = thrust::device_pointer_cast(v);
    else dv = w.data();
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[widx[k]] for k = 0, ..., n-1 */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecHIPEquals());
    if (!dmem) PetscCallHIP(hipMemcpy(v, w.data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
  } else PetscCallHIP(hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));

  /* The values moved from the device array to host memory, so this is a GPU-to-CPU transfer */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}