1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <string.h> 12 #include <hip/hip_runtime.h> 13 14 #include "../hip/ceed-hip-common.h" 15 #include "../hip/ceed-hip-compile.h" 16 #include "ceed-hip-ref.h" 17 18 //------------------------------------------------------------------------------ 19 // Basis apply - tensor 20 //------------------------------------------------------------------------------ 21 static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 22 CeedVector u, CeedVector v) { 23 Ceed ceed; 24 CeedInt Q_1d, dim; 25 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 26 const int max_block_size = 64; 27 const CeedScalar *d_u; 28 CeedScalar *d_v; 29 CeedBasis_Hip *data; 30 31 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 32 CeedCallBackend(CeedBasisGetData(basis, &data)); 33 34 // Get read/write access to u, v 35 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 36 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 37 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 38 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 39 40 // Clear v for transpose operation 41 if (is_transpose && !apply_add) { 42 CeedSize length; 43 44 CeedCallBackend(CeedVectorGetLength(v, &length)); 45 CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); 46 } 47 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 48 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 49 50 // Basis action 51 switch (eval_mode) { 52 case CEED_EVAL_INTERP: { 53 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 54 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 55 56 CeedCallBackend(CeedRunKernel_Hip(ceed, data->Interp, num_elem, block_size, interp_args)); 57 } break; 58 case CEED_EVAL_GRAD: { 59 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 60 const CeedInt block_size = max_block_size; 61 62 CeedCallBackend(CeedRunKernel_Hip(ceed, data->Grad, num_elem, block_size, grad_args)); 63 } break; 64 case CEED_EVAL_WEIGHT: { 65 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 66 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 67 const int block_size_x = Q_1d; 68 const int block_size_y = dim >= 2 ? Q_1d : 1; 69 70 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 71 } break; 72 case CEED_EVAL_NONE: /* handled separately below */ 73 break; 74 // LCOV_EXCL_START 75 case CEED_EVAL_DIV: 76 case CEED_EVAL_CURL: 77 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 78 // LCOV_EXCL_STOP 79 } 80 81 // Restore vectors, cover CEED_EVAL_NONE 82 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 83 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 84 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 85 return CEED_ERROR_SUCCESS; 86 } 87 88 static int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 89 CeedCallBackend(CeedBasisApplyCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); 90 return CEED_ERROR_SUCCESS; 91 } 92 93 static int CeedBasisApplyAdd_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 94 CeedVector v) { 95 CeedCallBackend(CeedBasisApplyCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); 96 return CEED_ERROR_SUCCESS; 97 } 98 99 //------------------------------------------------------------------------------ 100 // Basis apply - tensor AtPoints 101 //------------------------------------------------------------------------------ 102 static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 103 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 104 Ceed ceed; 105 CeedInt Q_1d, dim, max_num_points = num_points[0]; 106 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 107 const int max_block_size = 32; 108 const CeedScalar *d_x, *d_u; 109 CeedScalar *d_v; 110 CeedBasis_Hip *data; 111 112 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 113 CeedCallBackend(CeedBasisGetData(basis, &data)); 114 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 115 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 116 117 // Weight handled separately 118 if (eval_mode == CEED_EVAL_WEIGHT) { 119 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 120 return CEED_ERROR_SUCCESS; 121 } 122 123 // Check padded to uniform number of points per elem 124 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 125 { 126 CeedInt num_comp, q_comp; 127 CeedSize len, len_required; 128 129 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 130 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 131 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 132 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 133 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 134 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 135 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 136 len, len_required); 137 } 138 139 // Move num_points array to device 140 if (is_transpose) { 141 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 142 143 if (num_elem != data->num_elem_at_points) { 144 data->num_elem_at_points = num_elem; 145 146 if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); 147 CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes)); 148 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 149 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 150 } 151 if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) { 152 memcpy(data->h_points_per_elem, num_points, num_bytes); 153 CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice)); 154 } 155 } 156 157 // Build kernels if needed 158 if (data->num_points != max_num_points) { 159 CeedInt P_1d; 160 161 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 162 data->num_points = max_num_points; 163 164 // -- Create interp matrix to Chebyshev coefficients 165 if (!data->d_chebyshev_interp_1d) { 166 CeedSize interp_bytes; 167 CeedScalar *chebyshev_interp_1d; 168 169 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 170 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 171 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 172 CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 173 CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 174 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 175 } 176 177 // -- Compile kernels 178 char *basis_kernel_source; 179 const char *basis_kernel_path; 180 CeedInt num_comp; 181 182 if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); 183 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 184 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h", &basis_kernel_path)); 185 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 186 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 187 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 188 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 189 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 190 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 191 max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 192 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 193 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 194 CeedCallBackend(CeedFree(&basis_kernel_path)); 195 CeedCallBackend(CeedFree(&basis_kernel_source)); 196 } 197 198 // Get read/write access to u, v 199 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 200 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 201 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 202 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 203 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 204 205 // Clear v for transpose operation 206 if (is_transpose && !apply_add) { 207 CeedSize length; 208 209 CeedCallBackend(CeedVectorGetLength(v, &length)); 210 CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); 211 } 212 213 // Basis action 214 switch (eval_mode) { 215 case CEED_EVAL_INTERP: { 216 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 217 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 218 219 CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); 220 } break; 221 case CEED_EVAL_GRAD: { 222 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 223 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 224 225 CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); 226 } break; 227 case CEED_EVAL_WEIGHT: 228 case CEED_EVAL_NONE: /* handled separately below */ 229 break; 230 // LCOV_EXCL_START 231 case CEED_EVAL_DIV: 232 case CEED_EVAL_CURL: 233 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 234 // LCOV_EXCL_STOP 235 } 236 237 // Restore vectors, cover CEED_EVAL_NONE 238 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 239 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 240 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 241 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 242 return CEED_ERROR_SUCCESS; 243 } 244 245 static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 246 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 247 CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 248 return CEED_ERROR_SUCCESS; 249 } 250 251 static int CeedBasisApplyAddAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 252 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 253 CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 254 return CEED_ERROR_SUCCESS; 255 } 256 257 //------------------------------------------------------------------------------ 258 // Basis apply - non-tensor 259 //------------------------------------------------------------------------------ 260 static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 261 CeedVector u, CeedVector v) { 262 Ceed ceed; 263 CeedInt num_nodes, num_qpts; 264 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 265 const int elems_per_block = 1; 266 const int grid = CeedDivUpInt(num_elem, elems_per_block); 267 const CeedScalar *d_u; 268 CeedScalar *d_v; 269 CeedBasisNonTensor_Hip *data; 270 271 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 272 CeedCallBackend(CeedBasisGetData(basis, &data)); 273 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 274 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 275 276 // Get read/write access to u, v 277 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 278 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 279 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 280 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 281 282 // Clear v for transpose operation 283 if (is_transpose && !apply_add) { 284 CeedSize length; 285 286 CeedCallBackend(CeedVectorGetLength(v, &length)); 287 CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); 288 } 289 290 // Apply basis operation 291 switch (eval_mode) { 292 case CEED_EVAL_INTERP: { 293 void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 294 const int block_size_x = is_transpose ? num_nodes : num_qpts; 295 296 if (is_transpose) { 297 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 298 } else { 299 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 300 } 301 } break; 302 case CEED_EVAL_GRAD: { 303 void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 304 const int block_size_x = is_transpose ? num_nodes : num_qpts; 305 306 if (is_transpose) { 307 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 308 } else { 309 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 310 } 311 } break; 312 case CEED_EVAL_DIV: { 313 void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 314 const int block_size_x = is_transpose ? num_nodes : num_qpts; 315 316 if (is_transpose) { 317 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 318 } else { 319 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 320 } 321 } break; 322 case CEED_EVAL_CURL: { 323 void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 324 const int block_size_x = is_transpose ? num_nodes : num_qpts; 325 326 if (is_transpose) { 327 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 328 } else { 329 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 330 } 331 } break; 332 case CEED_EVAL_WEIGHT: { 333 CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 334 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 335 336 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 337 } break; 338 case CEED_EVAL_NONE: /* handled separately below */ 339 break; 340 } 341 342 // Restore vectors, cover CEED_EVAL_NONE 343 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 344 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 345 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 346 return CEED_ERROR_SUCCESS; 347 } 348 349 static int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 350 CeedVector v) { 351 CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v)); 352 return CEED_ERROR_SUCCESS; 353 } 354 355 static int CeedBasisApplyAddNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 356 CeedVector v) { 357 CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v)); 358 return CEED_ERROR_SUCCESS; 359 } 360 361 //------------------------------------------------------------------------------ 362 // Destroy tensor basis 363 //------------------------------------------------------------------------------ 364 static int CeedBasisDestroy_Hip(CeedBasis basis) { 365 Ceed ceed; 366 CeedBasis_Hip *data; 367 368 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 369 CeedCallBackend(CeedBasisGetData(basis, &data)); 370 CeedCallHip(ceed, hipModuleUnload(data->module)); 371 if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); 372 if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); 373 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 374 if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); 375 CeedCallHip(ceed, hipFree(data->d_interp_1d)); 376 CeedCallHip(ceed, hipFree(data->d_grad_1d)); 377 CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d)); 378 CeedCallBackend(CeedFree(&data)); 379 return CEED_ERROR_SUCCESS; 380 } 381 382 //------------------------------------------------------------------------------ 383 // Destroy non-tensor basis 384 //------------------------------------------------------------------------------ 385 static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { 386 Ceed ceed; 387 CeedBasisNonTensor_Hip *data; 388 389 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 390 CeedCallBackend(CeedBasisGetData(basis, &data)); 391 CeedCallHip(ceed, hipModuleUnload(data->module)); 392 if (data->d_q_weight) CeedCallHip(ceed, hipFree(data->d_q_weight)); 393 CeedCallHip(ceed, hipFree(data->d_interp)); 394 CeedCallHip(ceed, hipFree(data->d_grad)); 395 CeedCallHip(ceed, hipFree(data->d_div)); 396 CeedCallHip(ceed, hipFree(data->d_curl)); 397 CeedCallBackend(CeedFree(&data)); 398 return CEED_ERROR_SUCCESS; 399 } 400 401 //------------------------------------------------------------------------------ 402 // Create tensor 403 //------------------------------------------------------------------------------ 404 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 405 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 406 Ceed ceed; 407 char *basis_kernel_source; 408 const char *basis_kernel_path; 409 CeedInt num_comp; 410 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 411 const CeedInt interp_bytes = q_bytes * P_1d; 412 CeedBasis_Hip *data; 413 414 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 415 CeedCallBackend(CeedCalloc(1, &data)); 416 417 // Copy data to GPU 418 if (q_weight_1d) { 419 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); 420 CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); 421 } 422 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes)); 423 CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice)); 424 CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes)); 425 CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice)); 426 427 // Compile basis kernels 428 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 429 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor.h", &basis_kernel_path)); 430 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 431 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 432 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 433 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 434 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 435 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 436 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 437 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); 438 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 439 CeedCallBackend(CeedFree(&basis_kernel_path)); 440 CeedCallBackend(CeedFree(&basis_kernel_source)); 441 442 CeedCallBackend(CeedBasisSetData(basis, data)); 443 444 // Register backend functions 445 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip)); 446 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Hip)); 447 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip)); 448 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip)); 449 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip)); 450 return CEED_ERROR_SUCCESS; 451 } 452 453 //------------------------------------------------------------------------------ 454 // Create non-tensor H^1 455 //------------------------------------------------------------------------------ 456 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 457 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 458 Ceed ceed; 459 char *basis_kernel_source; 460 const char *basis_kernel_path; 461 CeedInt num_comp, q_comp_interp, q_comp_grad; 462 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 463 CeedBasisNonTensor_Hip *data; 464 465 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 466 CeedCallBackend(CeedCalloc(1, &data)); 467 468 // Copy basis data to GPU 469 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 470 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 471 if (q_weight) { 472 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 473 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 474 } 475 if (interp) { 476 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 477 478 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 479 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 480 } 481 if (grad) { 482 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 483 484 CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes)); 485 CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, grad_bytes, hipMemcpyHostToDevice)); 486 } 487 488 // Compile basis kernels 489 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 490 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); 491 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 492 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 493 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 494 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 495 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 496 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 497 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 498 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 499 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 500 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 501 CeedCallBackend(CeedFree(&basis_kernel_path)); 502 CeedCallBackend(CeedFree(&basis_kernel_source)); 503 504 CeedCallBackend(CeedBasisSetData(basis, data)); 505 506 // Register backend functions 507 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 508 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 509 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 510 return CEED_ERROR_SUCCESS; 511 } 512 513 //------------------------------------------------------------------------------ 514 // Create non-tensor H(div) 515 //------------------------------------------------------------------------------ 516 int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 517 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 518 Ceed ceed; 519 char *basis_kernel_source; 520 const char *basis_kernel_path; 521 CeedInt num_comp, q_comp_interp, q_comp_div; 522 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 523 CeedBasisNonTensor_Hip *data; 524 525 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 526 CeedCallBackend(CeedCalloc(1, &data)); 527 528 // Copy basis data to GPU 529 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 530 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 531 if (q_weight) { 532 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 533 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 534 } 535 if (interp) { 536 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 537 538 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 539 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 540 } 541 if (div) { 542 const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 543 544 CeedCallHip(ceed, hipMalloc((void **)&data->d_div, div_bytes)); 545 CeedCallHip(ceed, hipMemcpy(data->d_div, div, div_bytes, hipMemcpyHostToDevice)); 546 } 547 548 // Compile basis kernels 549 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 550 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); 551 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 552 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 553 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 554 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 555 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 556 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 557 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 558 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 559 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 560 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 561 CeedCallBackend(CeedFree(&basis_kernel_path)); 562 CeedCallBackend(CeedFree(&basis_kernel_source)); 563 564 CeedCallBackend(CeedBasisSetData(basis, data)); 565 566 // Register backend functions 567 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 568 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 569 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 570 return CEED_ERROR_SUCCESS; 571 } 572 573 //------------------------------------------------------------------------------ 574 // Create non-tensor H(curl) 575 //------------------------------------------------------------------------------ 576 int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 577 const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 578 Ceed ceed; 579 char *basis_kernel_source; 580 const char *basis_kernel_path; 581 CeedInt num_comp, q_comp_interp, q_comp_curl; 582 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 583 CeedBasisNonTensor_Hip *data; 584 585 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 586 CeedCallBackend(CeedCalloc(1, &data)); 587 588 // Copy basis data to GPU 589 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 590 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 591 if (q_weight) { 592 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 593 CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); 594 } 595 if (interp) { 596 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 597 598 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); 599 CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); 600 } 601 if (curl) { 602 const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 603 604 CeedCallHip(ceed, hipMalloc((void **)&data->d_curl, curl_bytes)); 605 CeedCallHip(ceed, hipMemcpy(data->d_curl, curl, curl_bytes, hipMemcpyHostToDevice)); 606 } 607 608 // Compile basis kernels 609 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 610 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); 611 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 612 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 613 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 614 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 615 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 616 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 617 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 618 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); 619 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 620 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 621 CeedCallBackend(CeedFree(&basis_kernel_path)); 622 CeedCallBackend(CeedFree(&basis_kernel_source)); 623 624 CeedCallBackend(CeedBasisSetData(basis, data)); 625 626 // Register backend functions 627 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); 628 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip)); 629 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); 630 return CEED_ERROR_SUCCESS; 631 } 632 633 //------------------------------------------------------------------------------ 634