1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <string.h> 12 #include <hip/hip_runtime.h> 13 14 #include "../hip/ceed-hip-common.h" 15 #include "../hip/ceed-hip-compile.h" 16 #include "ceed-hip-ref.h" 17 18 //------------------------------------------------------------------------------ 19 // Basis apply - tensor 20 //------------------------------------------------------------------------------ 21 static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 22 CeedVector u, CeedVector v) { 23 Ceed ceed; 24 CeedInt Q_1d, dim; 25 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 26 const int max_block_size = 64; 27 const CeedScalar *d_u; 28 CeedScalar *d_v; 29 CeedBasis_Hip *data; 30 31 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 32 CeedCallBackend(CeedBasisGetData(basis, &data)); 33 34 // Get read/write access to u, v 35 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 36 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 37 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 38 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 39 40 // Clear v for transpose operation 41 if (is_transpose && !apply_add) { 42 CeedInt num_comp, q_comp, num_nodes, num_qpts; 43 CeedSize length; 44 45 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 46 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 47 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 48 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 49 length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 50 CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); 51 } 52 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 53 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 54 55 // Basis action 56 switch (eval_mode) { 57 case CEED_EVAL_INTERP: { 58 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 59 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 60 61 CeedCallBackend(CeedRunKernel_Hip(ceed, data->Interp, num_elem, block_size, interp_args)); 62 } break; 63 case CEED_EVAL_GRAD: { 64 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 65 const CeedInt block_size = max_block_size; 66 67 CeedCallBackend(CeedRunKernel_Hip(ceed, data->Grad, num_elem, block_size, grad_args)); 68 } break; 69 case CEED_EVAL_WEIGHT: { 70 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 71 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 72 const int block_size_x = Q_1d; 73 const int block_size_y = dim >= 2 ? Q_1d : 1; 74 75 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 76 } break; 77 case CEED_EVAL_NONE: /* handled separately below */ 78 break; 79 // LCOV_EXCL_START 80 case CEED_EVAL_DIV: 81 case CEED_EVAL_CURL: 82 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 83 // LCOV_EXCL_STOP 84 } 85 86 // Restore vectors, cover CEED_EVAL_NONE 87 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 88 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 89 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 90 return CEED_ERROR_SUCCESS; 91 } 92 93 static int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 94 CeedCallBackend(CeedBasisApplyCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); 95 return CEED_ERROR_SUCCESS; 96 } 97 98 static int CeedBasisApplyAdd_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 99 CeedVector v) { 100 CeedCallBackend(CeedBasisApplyCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); 101 return CEED_ERROR_SUCCESS; 102 } 103 104 //------------------------------------------------------------------------------ 105 // Basis apply - tensor AtPoints 106 //------------------------------------------------------------------------------ 107 static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 108 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 109 Ceed ceed; 110 CeedInt Q_1d, dim, max_num_points = num_points[0]; 111 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 112 const int max_block_size = 32; 113 const CeedScalar *d_x, *d_u; 114 CeedScalar *d_v; 115 CeedBasis_Hip *data; 116 117 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 118 CeedCallBackend(CeedBasisGetData(basis, &data)); 119 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 120 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 121 122 // Weight handled separately 123 if (eval_mode == CEED_EVAL_WEIGHT) { 124 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 125 return CEED_ERROR_SUCCESS; 126 } 127 128 // Check padded to uniform number of points per elem 129 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 130 { 131 CeedInt num_comp, q_comp; 132 CeedSize len, len_required; 133 134 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 135 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 136 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 137 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 138 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 139 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 140 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 141 len, len_required); 142 } 143 144 // Move num_points array to device 145 if (is_transpose) { 146 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 147 148 if (num_elem != data->num_elem_at_points) { 149 data->num_elem_at_points = num_elem; 150 151 if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); 152 CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes)); 153 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 154 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 155 } 156 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 157 memcpy(data->h_points_per_elem, num_points, num_bytes); 158 CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice)); 159 } 160 } 161 162 // Build kernels if needed 163 if (data->num_points != max_num_points) { 164 CeedInt P_1d; 165 166 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 167 data->num_points = max_num_points; 168 169 // -- Create interp matrix to Chebyshev coefficients 170 if (!data->d_chebyshev_interp_1d) { 171 CeedSize interp_bytes; 172 CeedScalar *chebyshev_interp_1d; 173 174 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 175 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 176 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 177 CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 178 CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 179 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 180 } 181 182 // -- Compile kernels 183 char *basis_kernel_source; 184 const char *basis_kernel_path; 185 CeedInt num_comp; 186 187 if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); 188 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 189 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h", &basis_kernel_path)); 190 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 191 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 192 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 193 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 194 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 195 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 196 max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 197 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 198 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 199 CeedCallBackend(CeedFree(&basis_kernel_path)); 200 CeedCallBackend(CeedFree(&basis_kernel_source)); 201 } 202 203 // Get read/write access to u, v 204 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 205 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 206 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 207 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 208 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 209 210 // Clear v for transpose operation 211 if (is_transpose && !apply_add) { 212 CeedInt num_comp, q_comp, num_nodes; 213 CeedSize length; 214 215 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 216 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 217 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 218 length = 219 (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp)); 220 CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); 221 } 222 223 // Basis action 224 switch (eval_mode) { 225 case CEED_EVAL_INTERP: { 226 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 227 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 228 229 CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); 230 } break; 231 case CEED_EVAL_GRAD: { 232 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 233 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 234 235 CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); 236 } break; 237 case CEED_EVAL_WEIGHT: 238 case CEED_EVAL_NONE: /* handled separately below */ 239 break; 240 // LCOV_EXCL_START 241 case CEED_EVAL_DIV: 242 case CEED_EVAL_CURL: 243 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 244 // LCOV_EXCL_STOP 245 } 246 247 // Restore vectors, cover CEED_EVAL_NONE 248 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 249 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 250 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 251 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 252 return CEED_ERROR_SUCCESS; 253 } 254 255 static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 256 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 257 CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 258 return CEED_ERROR_SUCCESS; 259 } 260 261 static int CeedBasisApplyAddAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 262 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 263 CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 264 return CEED_ERROR_SUCCESS; 265 } 266 267 //------------------------------------------------------------------------------ 268 // Basis apply - non-tensor 269 //------------------------------------------------------------------------------ 270 static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 271 CeedVector u, CeedVector v) { 272 Ceed ceed; 273 CeedInt num_nodes, num_qpts; 274 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 275 const int elems_per_block = 1; 276 const int grid = CeedDivUpInt(num_elem, elems_per_block); 277 const CeedScalar *d_u; 278 CeedScalar *d_v; 279 CeedBasisNonTensor_Hip *data; 280 281 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 282 CeedCallBackend(CeedBasisGetData(basis, &data)); 283 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 284 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 285 286 // Get read/write access to u, v 287 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 288 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 289 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 290 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 291 292 // Clear v for transpose operation 293 if (is_transpose && !apply_add) { 294 CeedSize length; 295 296 CeedCallBackend(CeedVectorGetLength(v, &length)); 297 CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); 298 } 299 300 // Apply basis operation 301 switch (eval_mode) { 302 case CEED_EVAL_INTERP: { 303 void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 304 const int block_size_x = is_transpose ? num_nodes : num_qpts; 305 306 if (is_transpose) { 307 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 308 } else { 309 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 310 } 311 } break; 312 case CEED_EVAL_GRAD: { 313 void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 314 const int block_size_x = is_transpose ? num_nodes : num_qpts; 315 316 if (is_transpose) { 317 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 318 } else { 319 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 320 } 321 } break; 322 case CEED_EVAL_DIV: { 323 void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 324 const int block_size_x = is_transpose ? num_nodes : num_qpts; 325 326 if (is_transpose) { 327 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 328 } else { 329 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 330 } 331 } break; 332 case CEED_EVAL_CURL: { 333 void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 334 const int block_size_x = is_transpose ? num_nodes : num_qpts; 335 336 if (is_transpose) { 337 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 338 } else { 339 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 340 } 341 } break; 342 case CEED_EVAL_WEIGHT: { 343 CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 344 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 345 346 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 347 } break; 348 case CEED_EVAL_NONE: /* handled separately below */ 349 break; 350 } 351 352 // Restore vectors, cover CEED_EVAL_NONE 353 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 354 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 355 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 356 return CEED_ERROR_SUCCESS; 357 } 358 359 static int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 360 CeedVector v) { 361 CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); 362 return CEED_ERROR_SUCCESS; 363 } 364 365 static int CeedBasisApplyAddNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 366 CeedVector v) { 367 CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); 368 return CEED_ERROR_SUCCESS; 369 } 370 371 //------------------------------------------------------------------------------ 372 // Destroy tensor basis 373 //------------------------------------------------------------------------------ 374 static int CeedBasisDestroy_Hip(CeedBasis basis) { 375 Ceed ceed; 376 CeedBasis_Hip *data; 377 378 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 379 CeedCallBackend(CeedBasisGetData(basis, &data)); 380 CeedCallHip(ceed, hipModuleUnload(data->module)); 381 if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); 382 if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); 383 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 384 if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); 385 CeedCallHip(ceed, hipFree(data->d_interp_1d)); 386 CeedCallHip(ceed, hipFree(data->d_grad_1d)); 387 CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d)); 388 CeedCallBackend(CeedFree(&data)); 389 return CEED_ERROR_SUCCESS; 390 } 391 392 //------------------------------------------------------------------------------ 393 // Destroy non-tensor basis 394 //------------------------------------------------------------------------------ 395 static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { 396 Ceed ceed; 397 CeedBasisNonTensor_Hip *data; 398 399 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 400 CeedCallBackend(CeedBasisGetData(basis, &data)); 401 CeedCallHip(ceed, hipModuleUnload(data->module)); 402 if (data->d_q_weight) CeedCallHip(ceed, hipFree(data->d_q_weight)); 403 CeedCallHip(ceed, hipFree(data->d_interp)); 404 CeedCallHip(ceed, hipFree(data->d_grad)); 405 CeedCallHip(ceed, hipFree(data->d_div)); 406 CeedCallHip(ceed, hipFree(data->d_curl)); 407 CeedCallBackend(CeedFree(&data)); 408 return CEED_ERROR_SUCCESS; 409 } 410 411 //------------------------------------------------------------------------------ 412 // Create tensor 413 //------------------------------------------------------------------------------ 414 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 415 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 416 Ceed ceed; 417 char *basis_kernel_source; 418 const char *basis_kernel_path; 419 CeedInt num_comp; 420 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 421 const CeedInt interp_bytes = q_bytes * P_1d; 422 CeedBasis_Hip *data; 423 424 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 425 CeedCallBackend(CeedCalloc(1, &data)); 426 427 // Copy data to GPU 428 if (q_weight_1d) { 429 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); 430 CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); 431 } 432 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes)); 433 CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice)); 434 CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes)); 435 CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice)); 436 437 // Compile basis kernels 438 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 439 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor.h", &basis_kernel_path)); 440 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 441 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 442 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 443 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 444 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 445 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 446 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 447 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); 448 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 449 CeedCallBackend(CeedFree(&basis_kernel_path)); 450 CeedCallBackend(CeedFree(&basis_kernel_source)); 451 452 CeedCallBackend(CeedBasisSetData(basis, data)); 453 454 // Register backend functions 455 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip)); 456 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Hip)); 457 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip)); 458 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip)); 459 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip)); 460 return CEED_ERROR_SUCCESS; 461 } 462 463 //------------------------------------------------------------------------------ 464 // Create non-tensor H^1 465 //------------------------------------------------------------------------------ 466 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 467 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 468 Ceed ceed; 469 char *basis_kernel_source; 470 const char *basis_kernel_path; 471 CeedInt num_comp, q_comp_interp, q_comp_grad; 472 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 473 CeedBasisNonTensor_Hip *data; 474 475 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 476 CeedCallBackend(CeedCalloc(1, &data)); 477 478 // Copy basis data to GPU 479 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 480 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 481 if (q_weight) { 482 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 483 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 484 } 485 if (interp) { 486 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 487 488 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 489 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 490 } 491 if (grad) { 492 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 493 494 CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes)); 495 CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, grad_bytes, hipMemcpyHostToDevice)); 496 } 497 498 // Compile basis kernels 499 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 500 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); 501 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 502 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 503 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 504 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 505 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 506 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 507 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 508 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 509 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 510 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 511 CeedCallBackend(CeedFree(&basis_kernel_path)); 512 CeedCallBackend(CeedFree(&basis_kernel_source)); 513 514 CeedCallBackend(CeedBasisSetData(basis, data)); 515 516 // Register backend functions 517 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 518 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 519 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 520 return CEED_ERROR_SUCCESS; 521 } 522 523 //------------------------------------------------------------------------------ 524 // Create non-tensor H(div) 525 //------------------------------------------------------------------------------ 526 int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 527 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 528 Ceed ceed; 529 char *basis_kernel_source; 530 const char *basis_kernel_path; 531 CeedInt num_comp, q_comp_interp, q_comp_div; 532 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 533 CeedBasisNonTensor_Hip *data; 534 535 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 536 CeedCallBackend(CeedCalloc(1, &data)); 537 538 // Copy basis data to GPU 539 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 540 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 541 if (q_weight) { 542 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 543 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 544 } 545 if (interp) { 546 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 547 548 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 549 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 550 } 551 if (div) { 552 const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 553 554 CeedCallHip(ceed, hipMalloc((void **)&data->d_div, div_bytes)); 555 CeedCallHip(ceed, hipMemcpy(data->d_div, div, div_bytes, hipMemcpyHostToDevice)); 556 } 557 558 // Compile basis kernels 559 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 560 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); 561 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 562 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 563 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 564 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 565 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 566 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 567 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 568 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 569 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 570 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 571 CeedCallBackend(CeedFree(&basis_kernel_path)); 572 CeedCallBackend(CeedFree(&basis_kernel_source)); 573 574 CeedCallBackend(CeedBasisSetData(basis, data)); 575 576 // Register backend functions 577 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 578 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 579 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 580 return CEED_ERROR_SUCCESS; 581 } 582 583 //------------------------------------------------------------------------------ 584 // Create non-tensor H(curl) 585 //------------------------------------------------------------------------------ 586 int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 587 const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 588 Ceed ceed; 589 char *basis_kernel_source; 590 const char *basis_kernel_path; 591 CeedInt num_comp, q_comp_interp, q_comp_curl; 592 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 593 CeedBasisNonTensor_Hip *data; 594 595 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 596 CeedCallBackend(CeedCalloc(1, &data)); 597 598 // Copy basis data to GPU 599 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 600 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 601 if (q_weight) { 602 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 603 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 604 } 605 if (interp) { 606 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 607 608 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 609 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 610 } 611 if (curl) { 612 const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 613 614 CeedCallHip(ceed, hipMalloc((void **)&data->d_curl, curl_bytes)); 615 CeedCallHip(ceed, hipMemcpy(data->d_curl, curl, curl_bytes, hipMemcpyHostToDevice)); 616 } 617 618 // Compile basis kernels 619 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 620 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); 621 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 622 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 623 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 624 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 625 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 626 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 627 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 628 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 629 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 630 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 631 CeedCallBackend(CeedFree(&basis_kernel_path)); 632 CeedCallBackend(CeedFree(&basis_kernel_source)); 633 634 CeedCallBackend(CeedBasisSetData(basis, data)); 635 636 // Register backend functions 637 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 638 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 639 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 640 return CEED_ERROR_SUCCESS; 641 } 642 643 //------------------------------------------------------------------------------ 644