1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <string.h> 12 #include <hip/hip_runtime.h> 13 14 #include "../hip/ceed-hip-common.h" 15 #include "../hip/ceed-hip-compile.h" 16 #include "ceed-hip-ref.h" 17 18 //------------------------------------------------------------------------------ 19 // Basis apply - tensor 20 //------------------------------------------------------------------------------ 21 static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 22 CeedVector u, CeedVector v) { 23 Ceed ceed; 24 CeedInt Q_1d, dim; 25 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 26 const int max_block_size = 64; 27 const CeedScalar *d_u; 28 CeedScalar *d_v; 29 CeedBasis_Hip *data; 30 31 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 32 CeedCallBackend(CeedBasisGetData(basis, &data)); 33 34 // Get read/write access to u, v 35 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 36 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 37 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 38 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 39 40 // Clear v for transpose operation 41 if (is_transpose && !apply_add) { 42 CeedInt num_comp, q_comp, num_nodes, num_qpts; 43 CeedSize length; 44 45 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 46 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 47 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 48 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 49 length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 50 CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); 51 } 52 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 53 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 54 55 // Basis action 56 switch (eval_mode) { 57 case CEED_EVAL_INTERP: { 58 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 59 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 60 61 CeedCallBackend(CeedRunKernel_Hip(ceed, data->Interp, num_elem, block_size, interp_args)); 62 } break; 63 case CEED_EVAL_GRAD: { 64 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 65 const CeedInt block_size = max_block_size; 66 67 CeedCallBackend(CeedRunKernel_Hip(ceed, data->Grad, num_elem, block_size, grad_args)); 68 } break; 69 case CEED_EVAL_WEIGHT: { 70 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 71 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 72 const int block_size_x = Q_1d; 73 const int block_size_y = dim >= 2 ? Q_1d : 1; 74 75 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 76 } break; 77 case CEED_EVAL_NONE: /* handled separately below */ 78 break; 79 // LCOV_EXCL_START 80 case CEED_EVAL_DIV: 81 case CEED_EVAL_CURL: 82 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 83 // LCOV_EXCL_STOP 84 } 85 86 // Restore vectors, cover CEED_EVAL_NONE 87 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 88 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 89 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 90 CeedCallBackend(CeedDestroy(&ceed)); 91 return CEED_ERROR_SUCCESS; 92 } 93 94 static int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 95 CeedCallBackend(CeedBasisApplyCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); 96 return CEED_ERROR_SUCCESS; 97 } 98 99 static int CeedBasisApplyAdd_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 100 CeedVector v) { 101 CeedCallBackend(CeedBasisApplyCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); 102 return CEED_ERROR_SUCCESS; 103 } 104 105 //------------------------------------------------------------------------------ 106 // Basis apply - tensor AtPoints 107 //------------------------------------------------------------------------------ 108 static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 109 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 110 Ceed ceed; 111 CeedInt Q_1d, dim, max_num_points = num_points[0]; 112 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 113 const int max_block_size = 32; 114 const CeedScalar *d_x, *d_u; 115 CeedScalar *d_v; 116 CeedBasis_Hip *data; 117 118 CeedCallBackend(CeedBasisGetData(basis, &data)); 119 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 120 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 121 122 // Weight handled separately 123 if (eval_mode == CEED_EVAL_WEIGHT) { 124 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 125 return CEED_ERROR_SUCCESS; 126 } 127 128 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 129 130 // Check padded to uniform number of points per elem 131 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 132 { 133 CeedInt num_comp, q_comp; 134 CeedSize len, len_required; 135 136 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 137 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 138 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 139 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 140 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 141 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 142 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 143 len, len_required); 144 } 145 146 // Move num_points array to device 147 if (is_transpose) { 148 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 149 150 if (num_elem != data->num_elem_at_points) { 151 data->num_elem_at_points = num_elem; 152 153 if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); 154 CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes)); 155 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 156 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 157 } 158 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 159 memcpy(data->h_points_per_elem, num_points, num_bytes); 160 CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice)); 161 } 162 } 163 164 // Build kernels if needed 165 if (data->num_points != max_num_points) { 166 CeedInt P_1d; 167 168 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 169 data->num_points = max_num_points; 170 171 // -- Create interp matrix to Chebyshev coefficients 172 if (!data->d_chebyshev_interp_1d) { 173 CeedSize interp_bytes; 174 CeedScalar *chebyshev_interp_1d; 175 176 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 177 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 178 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 179 CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 180 CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 181 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 182 } 183 184 // -- Compile kernels 185 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h>\n"; 186 CeedInt num_comp; 187 188 if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); 189 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 190 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 191 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 192 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 193 max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 194 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 195 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 196 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 197 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 198 } 199 200 // Get read/write access to u, v 201 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 202 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 203 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 204 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 205 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 206 207 // Clear v for transpose operation 208 if (is_transpose && !apply_add) { 209 CeedInt num_comp, q_comp, num_nodes; 210 CeedSize length; 211 212 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 213 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 214 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 215 length = 216 (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp)); 217 CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); 218 } 219 220 // Basis action 221 switch (eval_mode) { 222 case CEED_EVAL_INTERP: { 223 void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 224 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 225 226 CeedCallBackend( 227 CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); 228 } break; 229 case CEED_EVAL_GRAD: { 230 void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 231 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 232 233 CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); 234 } break; 235 case CEED_EVAL_WEIGHT: 236 case CEED_EVAL_NONE: /* handled separately below */ 237 break; 238 // LCOV_EXCL_START 239 case CEED_EVAL_DIV: 240 case CEED_EVAL_CURL: 241 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 242 // LCOV_EXCL_STOP 243 } 244 245 // Restore vectors, cover CEED_EVAL_NONE 246 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 247 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 248 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 249 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 250 CeedCallBackend(CeedDestroy(&ceed)); 251 return CEED_ERROR_SUCCESS; 252 } 253 254 static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 255 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 256 CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 257 return CEED_ERROR_SUCCESS; 258 } 259 260 static int CeedBasisApplyAddAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 261 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 262 CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 263 return CEED_ERROR_SUCCESS; 264 } 265 266 //------------------------------------------------------------------------------ 267 // Basis apply - non-tensor 268 //------------------------------------------------------------------------------ 269 static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 270 CeedVector u, CeedVector v) { 271 Ceed ceed; 272 CeedInt num_nodes, num_qpts; 273 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 274 const int elems_per_block = 1; 275 const int grid = CeedDivUpInt(num_elem, elems_per_block); 276 const CeedScalar *d_u; 277 CeedScalar *d_v; 278 CeedBasisNonTensor_Hip *data; 279 280 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 281 CeedCallBackend(CeedBasisGetData(basis, &data)); 282 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 283 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 284 285 // Get read/write access to u, v 286 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 287 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 288 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 289 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 290 291 // Clear v for transpose operation 292 if (is_transpose && !apply_add) { 293 CeedSize length; 294 295 CeedCallBackend(CeedVectorGetLength(v, &length)); 296 CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); 297 } 298 299 // Apply basis operation 300 switch (eval_mode) { 301 case CEED_EVAL_INTERP: { 302 void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 303 const int block_size_x = is_transpose ? num_nodes : num_qpts; 304 305 if (is_transpose) { 306 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 307 } else { 308 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 309 } 310 } break; 311 case CEED_EVAL_GRAD: { 312 void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 313 const int block_size_x = is_transpose ? num_nodes : num_qpts; 314 315 if (is_transpose) { 316 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 317 } else { 318 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 319 } 320 } break; 321 case CEED_EVAL_DIV: { 322 void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 323 const int block_size_x = is_transpose ? num_nodes : num_qpts; 324 325 if (is_transpose) { 326 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 327 } else { 328 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 329 } 330 } break; 331 case CEED_EVAL_CURL: { 332 void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 333 const int block_size_x = is_transpose ? num_nodes : num_qpts; 334 335 if (is_transpose) { 336 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 337 } else { 338 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 339 } 340 } break; 341 case CEED_EVAL_WEIGHT: { 342 CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 343 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 344 345 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 346 } break; 347 case CEED_EVAL_NONE: /* handled separately below */ 348 break; 349 } 350 351 // Restore vectors, cover CEED_EVAL_NONE 352 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 353 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 354 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 355 CeedCallBackend(CeedDestroy(&ceed)); 356 return CEED_ERROR_SUCCESS; 357 } 358 359 static int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 360 CeedVector v) { 361 CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); 362 return CEED_ERROR_SUCCESS; 363 } 364 365 static int CeedBasisApplyAddNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 366 CeedVector v) { 367 CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); 368 return CEED_ERROR_SUCCESS; 369 } 370 371 //------------------------------------------------------------------------------ 372 // Destroy tensor basis 373 //------------------------------------------------------------------------------ 374 static int CeedBasisDestroy_Hip(CeedBasis basis) { 375 Ceed ceed; 376 CeedBasis_Hip *data; 377 378 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 379 CeedCallBackend(CeedBasisGetData(basis, &data)); 380 CeedCallHip(ceed, hipModuleUnload(data->module)); 381 if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); 382 if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); 383 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 384 if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); 385 CeedCallHip(ceed, hipFree(data->d_interp_1d)); 386 CeedCallHip(ceed, hipFree(data->d_grad_1d)); 387 CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d)); 388 CeedCallBackend(CeedFree(&data)); 389 CeedCallBackend(CeedDestroy(&ceed)); 390 return CEED_ERROR_SUCCESS; 391 } 392 393 //------------------------------------------------------------------------------ 394 // Destroy non-tensor basis 395 //------------------------------------------------------------------------------ 396 static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { 397 Ceed ceed; 398 CeedBasisNonTensor_Hip *data; 399 400 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 401 CeedCallBackend(CeedBasisGetData(basis, &data)); 402 CeedCallHip(ceed, hipModuleUnload(data->module)); 403 if (data->d_q_weight) CeedCallHip(ceed, hipFree(data->d_q_weight)); 404 CeedCallHip(ceed, hipFree(data->d_interp)); 405 CeedCallHip(ceed, hipFree(data->d_grad)); 406 CeedCallHip(ceed, hipFree(data->d_div)); 407 CeedCallHip(ceed, hipFree(data->d_curl)); 408 CeedCallBackend(CeedFree(&data)); 409 CeedCallBackend(CeedDestroy(&ceed)); 410 return CEED_ERROR_SUCCESS; 411 } 412 413 //------------------------------------------------------------------------------ 414 // Create tensor 415 //------------------------------------------------------------------------------ 416 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 417 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 418 Ceed ceed; 419 CeedInt num_comp; 420 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 421 const CeedInt interp_bytes = q_bytes * P_1d; 422 CeedBasis_Hip *data; 423 424 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 425 CeedCallBackend(CeedCalloc(1, &data)); 426 427 // Copy data to GPU 428 if (q_weight_1d) { 429 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); 430 CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); 431 } 432 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes)); 433 CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice)); 434 CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes)); 435 CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice)); 436 437 // Compile basis kernels 438 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor.h>\n"; 439 440 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 441 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 442 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 443 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 444 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 445 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); 446 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 447 448 CeedCallBackend(CeedBasisSetData(basis, data)); 449 450 // Register backend functions 451 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip)); 452 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Hip)); 453 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip)); 454 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip)); 455 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip)); 456 CeedCallBackend(CeedDestroy(&ceed)); 457 return CEED_ERROR_SUCCESS; 458 } 459 460 //------------------------------------------------------------------------------ 461 // Create non-tensor H^1 462 //------------------------------------------------------------------------------ 463 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 464 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 465 Ceed ceed; 466 CeedInt num_comp, q_comp_interp, q_comp_grad; 467 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 468 CeedBasisNonTensor_Hip *data; 469 470 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 471 CeedCallBackend(CeedCalloc(1, &data)); 472 473 // Copy basis data to GPU 474 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 475 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 476 if (q_weight) { 477 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 478 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 479 } 480 if (interp) { 481 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 482 483 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 484 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 485 } 486 if (grad) { 487 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 488 489 CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes)); 490 CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, grad_bytes, hipMemcpyHostToDevice)); 491 } 492 493 // Compile basis kernels 494 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n"; 495 496 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 497 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 498 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 499 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 500 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 501 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 502 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 503 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 504 505 CeedCallBackend(CeedBasisSetData(basis, data)); 506 507 // Register backend functions 508 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 509 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 510 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 511 CeedCallBackend(CeedDestroy(&ceed)); 512 return CEED_ERROR_SUCCESS; 513 } 514 515 //------------------------------------------------------------------------------ 516 // Create non-tensor H(div) 517 //------------------------------------------------------------------------------ 518 int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 519 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 520 Ceed ceed; 521 CeedInt num_comp, q_comp_interp, q_comp_div; 522 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 523 CeedBasisNonTensor_Hip *data; 524 525 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 526 CeedCallBackend(CeedCalloc(1, &data)); 527 528 // Copy basis data to GPU 529 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 530 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 531 if (q_weight) { 532 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 533 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 534 } 535 if (interp) { 536 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 537 538 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 539 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 540 } 541 if (div) { 542 const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 543 544 CeedCallHip(ceed, hipMalloc((void **)&data->d_div, div_bytes)); 545 CeedCallHip(ceed, hipMemcpy(data->d_div, div, div_bytes, hipMemcpyHostToDevice)); 546 } 547 548 // Compile basis kernels 549 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n"; 550 551 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 552 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 553 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 554 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 555 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 556 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 557 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 558 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 559 560 CeedCallBackend(CeedBasisSetData(basis, data)); 561 562 // Register backend functions 563 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 564 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 565 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 566 CeedCallBackend(CeedDestroy(&ceed)); 567 return CEED_ERROR_SUCCESS; 568 } 569 570 //------------------------------------------------------------------------------ 571 // Create non-tensor H(curl) 572 //------------------------------------------------------------------------------ 573 int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 574 const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 575 Ceed ceed; 576 CeedInt num_comp, q_comp_interp, q_comp_curl; 577 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 578 CeedBasisNonTensor_Hip *data; 579 580 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 581 CeedCallBackend(CeedCalloc(1, &data)); 582 583 // Copy basis data to GPU 584 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 585 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 586 if (q_weight) { 587 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 588 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 589 } 590 if (interp) { 591 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 592 593 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 594 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 595 } 596 if (curl) { 597 const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 598 599 CeedCallHip(ceed, hipMalloc((void **)&data->d_curl, curl_bytes)); 600 CeedCallHip(ceed, hipMemcpy(data->d_curl, curl, curl_bytes, hipMemcpyHostToDevice)); 601 } 602 603 // Compile basis kernels 604 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n"; 605 606 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 607 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 608 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 609 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 610 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 611 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 612 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 613 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 614 615 CeedCallBackend(CeedBasisSetData(basis, data)); 616 617 // Register backend functions 618 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 619 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 620 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 621 CeedCallBackend(CeedDestroy(&ceed)); 622 return CEED_ERROR_SUCCESS; 623 } 624 625 //------------------------------------------------------------------------------ 626