1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <string.h> 12 #include <hip/hip_runtime.h> 13 14 #include "../hip/ceed-hip-common.h" 15 #include "../hip/ceed-hip-compile.h" 16 #include "ceed-hip-ref.h" 17 18 //------------------------------------------------------------------------------ 19 // Basis apply - tensor 20 //------------------------------------------------------------------------------ 21 static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 22 CeedVector u, CeedVector v) { 23 Ceed ceed; 24 CeedInt Q_1d, dim; 25 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 26 const int max_block_size = 64; 27 const CeedScalar *d_u; 28 CeedScalar *d_v; 29 CeedBasis_Hip *data; 30 31 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 32 CeedCallBackend(CeedBasisGetData(basis, &data)); 33 34 // Get read/write access to u, v 35 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 36 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 37 if (apply_add) { 38 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 39 } else { 40 // Clear v for transpose operation 41 if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); 42 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 43 } 44 45 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 46 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 47 48 // Basis action 49 switch (eval_mode) { 50 case CEED_EVAL_INTERP: { 51 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 52 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 53 54 CeedCallBackend(CeedRunKernel_Hip(ceed, data->Interp, num_elem, block_size, interp_args)); 55 } break; 56 case CEED_EVAL_GRAD: { 57 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 58 const CeedInt block_size = max_block_size; 59 60 CeedCallBackend(CeedRunKernel_Hip(ceed, data->Grad, num_elem, block_size, grad_args)); 61 } break; 62 case CEED_EVAL_WEIGHT: { 63 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 64 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 65 const int block_size_x = Q_1d; 66 const int block_size_y = dim >= 2 ? Q_1d : 1; 67 68 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 69 } break; 70 case CEED_EVAL_NONE: /* handled separately below */ 71 break; 72 // LCOV_EXCL_START 73 case CEED_EVAL_DIV: 74 case CEED_EVAL_CURL: 75 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 76 // LCOV_EXCL_STOP 77 } 78 79 // Restore vectors, cover CEED_EVAL_NONE 80 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 81 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 82 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 83 CeedCallBackend(CeedDestroy(&ceed)); 84 return CEED_ERROR_SUCCESS; 85 } 86 87 static int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 88 CeedCallBackend(CeedBasisApplyCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); 89 return CEED_ERROR_SUCCESS; 90 } 91 92 static int CeedBasisApplyAdd_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 93 CeedVector v) { 94 CeedCallBackend(CeedBasisApplyCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); 95 return CEED_ERROR_SUCCESS; 96 } 97 98 //------------------------------------------------------------------------------ 99 // Basis apply - tensor AtPoints 100 //------------------------------------------------------------------------------ 101 static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 102 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 103 Ceed ceed; 104 CeedInt Q_1d, dim, max_num_points = num_points[0]; 105 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 106 const int max_block_size = 32; 107 const CeedScalar *d_x, *d_u; 108 CeedScalar *d_v; 109 CeedBasis_Hip *data; 110 111 CeedCallBackend(CeedBasisGetData(basis, &data)); 112 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 113 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 114 115 // Weight handled separately 116 if (eval_mode == CEED_EVAL_WEIGHT) { 117 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 118 return CEED_ERROR_SUCCESS; 119 } 120 121 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 122 123 // Check padded to uniform number of points per elem 124 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 125 { 126 CeedInt num_comp, q_comp; 127 CeedSize len, len_required; 128 129 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 130 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 131 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 132 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 133 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 134 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 135 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 136 len, len_required); 137 } 138 139 // Move num_points array to device 140 if (is_transpose) { 141 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 142 143 if (num_elem != data->num_elem_at_points) { 144 data->num_elem_at_points = num_elem; 145 146 if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); 147 CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes)); 148 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 149 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 150 } 151 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 152 memcpy(data->h_points_per_elem, num_points, num_bytes); 153 CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice)); 154 } 155 } 156 157 // Build kernels if needed 158 if (data->num_points != max_num_points) { 159 CeedInt P_1d; 160 161 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 162 data->num_points = max_num_points; 163 164 // -- Create interp matrix to Chebyshev coefficients 165 if (!data->d_chebyshev_interp_1d) { 166 CeedSize interp_bytes; 167 CeedScalar *chebyshev_interp_1d; 168 169 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 170 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 171 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 172 CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 173 CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 174 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 175 } 176 177 // -- Compile kernels 178 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h>\n"; 179 CeedInt num_comp; 180 181 if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); 182 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 183 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 184 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 185 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 186 max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 187 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 188 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 189 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 190 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 191 } 192 193 // Get read/write access to u, v 194 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 195 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 196 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 197 if (apply_add) { 198 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 199 } else { 200 // Clear v for transpose operation 201 if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); 202 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 203 } 204 205 // Basis action 206 switch (eval_mode) { 207 case CEED_EVAL_INTERP: { 208 void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 209 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 210 211 CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, 212 interp_args)); 213 } break; 214 case CEED_EVAL_GRAD: { 215 void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 216 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 217 218 CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); 219 } break; 220 case CEED_EVAL_WEIGHT: 221 case CEED_EVAL_NONE: /* handled separately below */ 222 break; 223 // LCOV_EXCL_START 224 case CEED_EVAL_DIV: 225 case CEED_EVAL_CURL: 226 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 227 // LCOV_EXCL_STOP 228 } 229 230 // Restore vectors, cover CEED_EVAL_NONE 231 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 232 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 233 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 234 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 235 CeedCallBackend(CeedDestroy(&ceed)); 236 return CEED_ERROR_SUCCESS; 237 } 238 239 static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 240 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 241 CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 242 return CEED_ERROR_SUCCESS; 243 } 244 245 static int CeedBasisApplyAddAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 246 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 247 CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 248 return CEED_ERROR_SUCCESS; 249 } 250 251 //------------------------------------------------------------------------------ 252 // Basis apply - non-tensor 253 //------------------------------------------------------------------------------ 254 static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 255 CeedVector u, CeedVector v) { 256 Ceed ceed; 257 CeedInt num_nodes, num_qpts; 258 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 259 const int elems_per_block = 1; 260 const int grid = CeedDivUpInt(num_elem, elems_per_block); 261 const CeedScalar *d_u; 262 CeedScalar *d_v; 263 CeedBasisNonTensor_Hip *data; 264 265 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 266 CeedCallBackend(CeedBasisGetData(basis, &data)); 267 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 268 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 269 270 // Get read/write access to u, v 271 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 272 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 273 if (apply_add) { 274 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 275 } else { 276 // Clear v for transpose operation 277 if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); 278 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 279 } 280 281 // Apply basis operation 282 switch (eval_mode) { 283 case CEED_EVAL_INTERP: { 284 void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 285 const int block_size_x = is_transpose ? num_nodes : num_qpts; 286 287 if (is_transpose) { 288 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 289 } else { 290 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 291 } 292 } break; 293 case CEED_EVAL_GRAD: { 294 void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 295 const int block_size_x = is_transpose ? num_nodes : num_qpts; 296 297 if (is_transpose) { 298 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 299 } else { 300 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 301 } 302 } break; 303 case CEED_EVAL_DIV: { 304 void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 305 const int block_size_x = is_transpose ? num_nodes : num_qpts; 306 307 if (is_transpose) { 308 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 309 } else { 310 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 311 } 312 } break; 313 case CEED_EVAL_CURL: { 314 void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 315 const int block_size_x = is_transpose ? num_nodes : num_qpts; 316 317 if (is_transpose) { 318 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 319 } else { 320 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 321 } 322 } break; 323 case CEED_EVAL_WEIGHT: { 324 CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 325 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 326 327 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 328 } break; 329 case CEED_EVAL_NONE: /* handled separately below */ 330 break; 331 } 332 333 // Restore vectors, cover CEED_EVAL_NONE 334 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 335 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 336 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 337 CeedCallBackend(CeedDestroy(&ceed)); 338 return CEED_ERROR_SUCCESS; 339 } 340 341 static int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 342 CeedVector v) { 343 CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); 344 return CEED_ERROR_SUCCESS; 345 } 346 347 static int CeedBasisApplyAddNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 348 CeedVector v) { 349 CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); 350 return CEED_ERROR_SUCCESS; 351 } 352 353 //------------------------------------------------------------------------------ 354 // Destroy tensor basis 355 //------------------------------------------------------------------------------ 356 static int CeedBasisDestroy_Hip(CeedBasis basis) { 357 Ceed ceed; 358 CeedBasis_Hip *data; 359 360 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 361 CeedCallBackend(CeedBasisGetData(basis, &data)); 362 CeedCallHip(ceed, hipModuleUnload(data->module)); 363 if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); 364 if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); 365 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 366 if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); 367 CeedCallHip(ceed, hipFree(data->d_interp_1d)); 368 CeedCallHip(ceed, hipFree(data->d_grad_1d)); 369 CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d)); 370 CeedCallBackend(CeedFree(&data)); 371 CeedCallBackend(CeedDestroy(&ceed)); 372 return CEED_ERROR_SUCCESS; 373 } 374 375 //------------------------------------------------------------------------------ 376 // Destroy non-tensor basis 377 //------------------------------------------------------------------------------ 378 static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { 379 Ceed ceed; 380 CeedBasisNonTensor_Hip *data; 381 382 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 383 CeedCallBackend(CeedBasisGetData(basis, &data)); 384 CeedCallHip(ceed, hipModuleUnload(data->module)); 385 if (data->d_q_weight) CeedCallHip(ceed, hipFree(data->d_q_weight)); 386 CeedCallHip(ceed, hipFree(data->d_interp)); 387 CeedCallHip(ceed, hipFree(data->d_grad)); 388 CeedCallHip(ceed, hipFree(data->d_div)); 389 CeedCallHip(ceed, hipFree(data->d_curl)); 390 CeedCallBackend(CeedFree(&data)); 391 CeedCallBackend(CeedDestroy(&ceed)); 392 return CEED_ERROR_SUCCESS; 393 } 394 395 //------------------------------------------------------------------------------ 396 // Create tensor 397 //------------------------------------------------------------------------------ 398 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 399 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 400 Ceed ceed; 401 CeedInt num_comp; 402 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 403 const CeedInt interp_bytes = q_bytes * P_1d; 404 CeedBasis_Hip *data; 405 406 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 407 CeedCallBackend(CeedCalloc(1, &data)); 408 409 // Copy data to GPU 410 if (q_weight_1d) { 411 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); 412 CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); 413 } 414 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes)); 415 CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice)); 416 CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes)); 417 CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice)); 418 419 // Compile basis kernels 420 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor.h>\n"; 421 422 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 423 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 424 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 425 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 426 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 427 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); 428 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 429 430 CeedCallBackend(CeedBasisSetData(basis, data)); 431 432 // Register backend functions 433 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip)); 434 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Hip)); 435 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip)); 436 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip)); 437 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip)); 438 CeedCallBackend(CeedDestroy(&ceed)); 439 return CEED_ERROR_SUCCESS; 440 } 441 442 //------------------------------------------------------------------------------ 443 // Create non-tensor H^1 444 //------------------------------------------------------------------------------ 445 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 446 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 447 Ceed ceed; 448 CeedInt num_comp, q_comp_interp, q_comp_grad; 449 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 450 CeedBasisNonTensor_Hip *data; 451 452 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 453 CeedCallBackend(CeedCalloc(1, &data)); 454 455 // Copy basis data to GPU 456 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 457 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 458 if (q_weight) { 459 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 460 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 461 } 462 if (interp) { 463 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 464 465 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 466 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 467 } 468 if (grad) { 469 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 470 471 CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes)); 472 CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, grad_bytes, hipMemcpyHostToDevice)); 473 } 474 475 // Compile basis kernels 476 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n"; 477 478 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 479 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 480 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 481 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 482 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 483 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 484 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 485 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 486 487 CeedCallBackend(CeedBasisSetData(basis, data)); 488 489 // Register backend functions 490 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 491 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 492 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 493 CeedCallBackend(CeedDestroy(&ceed)); 494 return CEED_ERROR_SUCCESS; 495 } 496 497 //------------------------------------------------------------------------------ 498 // Create non-tensor H(div) 499 //------------------------------------------------------------------------------ 500 int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 501 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 502 Ceed ceed; 503 CeedInt num_comp, q_comp_interp, q_comp_div; 504 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 505 CeedBasisNonTensor_Hip *data; 506 507 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 508 CeedCallBackend(CeedCalloc(1, &data)); 509 510 // Copy basis data to GPU 511 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 512 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 513 if (q_weight) { 514 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 515 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 516 } 517 if (interp) { 518 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 519 520 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 521 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 522 } 523 if (div) { 524 const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 525 526 CeedCallHip(ceed, hipMalloc((void **)&data->d_div, div_bytes)); 527 CeedCallHip(ceed, hipMemcpy(data->d_div, div, div_bytes, hipMemcpyHostToDevice)); 528 } 529 530 // Compile basis kernels 531 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n"; 532 533 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 534 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 535 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 536 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 537 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 538 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 539 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 540 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 541 542 CeedCallBackend(CeedBasisSetData(basis, data)); 543 544 // Register backend functions 545 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 546 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 547 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 548 CeedCallBackend(CeedDestroy(&ceed)); 549 return CEED_ERROR_SUCCESS; 550 } 551 552 //------------------------------------------------------------------------------ 553 // Create non-tensor H(curl) 554 //------------------------------------------------------------------------------ 555 int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 556 const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 557 Ceed ceed; 558 CeedInt num_comp, q_comp_interp, q_comp_curl; 559 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 560 CeedBasisNonTensor_Hip *data; 561 562 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 563 CeedCallBackend(CeedCalloc(1, &data)); 564 565 // Copy basis data to GPU 566 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 567 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 568 if (q_weight) { 569 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 570 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 571 } 572 if (interp) { 573 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 574 575 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 576 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 577 } 578 if (curl) { 579 const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 580 581 CeedCallHip(ceed, hipMalloc((void **)&data->d_curl, curl_bytes)); 582 CeedCallHip(ceed, hipMemcpy(data->d_curl, curl, curl_bytes, hipMemcpyHostToDevice)); 583 } 584 585 // Compile basis kernels 586 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n"; 587 588 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 589 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 590 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 591 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 592 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 593 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 594 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 595 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 596 597 CeedCallBackend(CeedBasisSetData(basis, data)); 598 599 // Register backend functions 600 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 601 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 602 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 603 CeedCallBackend(CeedDestroy(&ceed)); 604 return CEED_ERROR_SUCCESS; 605 } 606 607 //------------------------------------------------------------------------------ 608