// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <ceed/backend.h>
#include <ceed/jit-tools.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <string.h>

#include "../cuda/ceed-cuda-common.h"
#include "../cuda/ceed-cuda-compile.h"
#include "ceed-cuda-ref.h"

//------------------------------------------------------------------------------
// Basis apply - tensor
//------------------------------------------------------------------------------
// Core tensor-product basis action shared by Apply and ApplyAdd.
// When apply_add is false and t_mode is CEED_TRANSPOSE, v is zeroed before the
// kernel launch so the transpose kernels can accumulate into it.
static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
                                   CeedVector u, CeedVector v) {
  Ceed              ceed;
  CeedInt           Q_1d, dim;
  const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
  const int         max_block_size = 32;
  const CeedScalar *d_u;
  CeedScalar       *d_v;
  CeedBasis_Cuda   *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedBasisGetData(basis, &data));

  // Get read/write access to u, v
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Clear v for transpose operation
  if (is_transpose && !apply_add) {
    CeedSize length;

    CeedCallBackend(CeedVectorGetLength(v, &length));
    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
  }
  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
  CeedCallBackend(CeedBasisGetDimension(basis, &dim));

  // Basis action
  switch (eval_mode) {
    case CEED_EVAL_INTERP: {
      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v};
      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args));
    } break;
    case CEED_EVAL_GRAD: {
      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
      const CeedInt block_size  = max_block_size;

      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args));
    } break;
    case CEED_EVAL_WEIGHT: {
      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
      void     *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
      const int block_size_x  = Q_1d;
      const int block_size_y  = dim >= 2 ? Q_1d : 1;

      CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args));
    } break;
    case CEED_EVAL_NONE: /* handled separately below */
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
      // LCOV_EXCL_STOP
  }

  // Restore vectors, cover CEED_EVAL_NONE
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  return CEED_ERROR_SUCCESS;
}

static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
                               CeedVector v) {
  CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v));
  return CEED_ERROR_SUCCESS;
}

static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
                                  CeedVector v) {
  CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Basis apply - tensor AtPoints
//------------------------------------------------------------------------------
// Core basis action at arbitrary points shared by ApplyAtPoints and
// ApplyAddAtPoints. Kernels are JiT compiled lazily the first time the
// (padded) number of points per element changes.
static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
                                           CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
  Ceed              ceed;
  CeedInt           Q_1d, dim, max_num_points = num_points[0];
  const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
  const int         max_block_size = 32;
  const CeedScalar *d_x, *d_u;
  CeedScalar       *d_v;
  CeedBasis_Cuda   *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedBasisGetData(basis, &data));
  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
  CeedCallBackend(CeedBasisGetDimension(basis, &dim));

  // Weight handled separately
  if (eval_mode == CEED_EVAL_WEIGHT) {
    CeedCallBackend(CeedVectorSetValue(v, 1.0));
    return CEED_ERROR_SUCCESS;
  }

  // Check padded to uniform number of points per elem
  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
  {
    CeedInt  num_comp, q_comp;
    CeedSize len, len_required;

    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
              len, len_required);
  }

  // Move num_points array to device
  if (is_transpose) {
    const CeedInt num_bytes = num_elem * sizeof(CeedInt);

    if (num_elem != data->num_elem_at_points) {
      data->num_elem_at_points = num_elem;

      if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
      CeedCallBackend(CeedFree(&data->h_points_per_elem));
      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
    }
    // Sync the device copy only when the cached host copy differs.
    // Note: memcmp() returns nonzero on mismatch; the previous "!memcmp" test was
    // inverted, so the upload only ran when the cache already matched and
    // d_points_per_elem was left stale (uninitialized after a resize).
    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
      memcpy(data->h_points_per_elem, num_points, num_bytes);
      CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
    }
  }

  // Build kernels if needed
  if (data->num_points != max_num_points) {
    CeedInt P_1d;

    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    data->num_points = max_num_points;

    // -- Create interp matrix to Chebyshev coefficients
    if (!data->d_chebyshev_interp_1d) {
      CeedSize    interp_bytes;
      CeedScalar *chebyshev_interp_1d;

      interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
      CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
      CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
      CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
      CeedCallBackend(CeedFree(&chebyshev_interp_1d));
    }

    // -- Compile kernels
    char       *basis_kernel_source;
    const char *basis_kernel_path;
    CeedInt     num_comp;

    if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path));
    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
    CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
    CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
                                     Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
    CeedCallBackend(CeedFree(&basis_kernel_path));
    CeedCallBackend(CeedFree(&basis_kernel_source));
  }

  // Get read/write access to u, v
  CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Clear v for transpose operation
  if (is_transpose && !apply_add) {
    CeedSize length;

    CeedCallBackend(CeedVectorGetLength(v, &length));
    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
  }

  // Basis action
  switch (eval_mode) {
    case CEED_EVAL_INTERP: {
      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
    } break;
    case CEED_EVAL_GRAD: {
      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
    } break;
    case CEED_EVAL_WEIGHT:
    case CEED_EVAL_NONE: /* handled separately below */
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
      // LCOV_EXCL_STOP
  }

  // Restore vectors, cover CEED_EVAL_NONE
  CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  return CEED_ERROR_SUCCESS;
}

static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
                                       CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
  return CEED_ERROR_SUCCESS;
}

static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
                                          CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Basis apply - non-tensor
//------------------------------------------------------------------------------
// Core non-tensor basis action shared by Apply and ApplyAdd.
// CEED_EVAL_DIV and CEED_EVAL_CURL reuse the generic Deriv/DerivTranspose
// kernels with the appropriate operator matrix.
static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
                                            CeedVector u, CeedVector v) {
  Ceed                     ceed;
  CeedInt                  num_nodes, num_qpts;
  const CeedInt            is_transpose    = t_mode == CEED_TRANSPOSE;
  const int                elems_per_block = 1;
  const int                grid            = CeedDivUpInt(num_elem, elems_per_block);
  const CeedScalar        *d_u;
  CeedScalar              *d_v;
  CeedBasisNonTensor_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedBasisGetData(basis, &data));
  CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
  CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));

  // Get read/write access to u, v
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Clear v for transpose operation
  if (is_transpose && !apply_add) {
    CeedSize length;

    CeedCallBackend(CeedVectorGetLength(v, &length));
    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
  }

  // Apply basis operation
  switch (eval_mode) {
    case CEED_EVAL_INTERP: {
      void     *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v};
      const int block_size_x  = is_transpose ? num_nodes : num_qpts;

      if (is_transpose) {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args));
      } else {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args));
      }
    } break;
    case CEED_EVAL_GRAD: {
      void     *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v};
      const int block_size_x = is_transpose ? num_nodes : num_qpts;

      if (is_transpose) {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args));
      } else {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args));
      }
    } break;
    case CEED_EVAL_DIV: {
      void     *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v};
      const int block_size_x = is_transpose ? num_nodes : num_qpts;

      if (is_transpose) {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args));
      } else {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args));
      }
    } break;
    case CEED_EVAL_CURL: {
      void     *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v};
      const int block_size_x = is_transpose ? num_nodes : num_qpts;

      if (is_transpose) {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args));
      } else {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args));
      }
    } break;
    case CEED_EVAL_WEIGHT: {
      CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
      void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v};

      CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args));
    } break;
    case CEED_EVAL_NONE: /* handled separately below */
      break;
  }

  // Restore vectors, cover CEED_EVAL_NONE
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  return CEED_ERROR_SUCCESS;
}

static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
                                        CeedVector v) {
  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v));
  return CEED_ERROR_SUCCESS;
}

static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
                                           CeedVector v) {
  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Destroy tensor basis
//------------------------------------------------------------------------------
static int CeedBasisDestroy_Cuda(CeedBasis basis) {
  Ceed            ceed;
  CeedBasis_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedBasisGetData(basis, &data));
  CeedCallCuda(ceed, cuModuleUnload(data->module));
  if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
  if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
  CeedCallBackend(CeedFree(&data->h_points_per_elem));
  if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
  // cudaFree(NULL) is a no-op, so unconditional frees are safe here
  CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
  CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
  CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
  CeedCallBackend(CeedFree(&data));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Destroy non-tensor basis
//------------------------------------------------------------------------------
static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
  Ceed                     ceed;
  CeedBasisNonTensor_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedBasisGetData(basis, &data));
  CeedCallCuda(ceed, cuModuleUnload(data->module));
  if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight));
  CeedCallCuda(ceed, cudaFree(data->d_interp));
  CeedCallCuda(ceed, cudaFree(data->d_grad));
  CeedCallCuda(ceed, cudaFree(data->d_div));
  CeedCallCuda(ceed, cudaFree(data->d_curl));
  CeedCallBackend(CeedFree(&data));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create tensor
//------------------------------------------------------------------------------
int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
  Ceed            ceed;
  char           *basis_kernel_source;
  const char     *basis_kernel_path;
  CeedInt         num_comp;
  const CeedInt   q_bytes      = Q_1d * sizeof(CeedScalar);
  const CeedInt   interp_bytes = q_bytes * P_1d;
  CeedBasis_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &data));

  // Copy data to GPU
  if (q_weight_1d) {
    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
  }
  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes));
  CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice));
  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes));
  CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice));

  // Compile basis kernels
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
                                   Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                   "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
  CeedCallBackend(CeedFree(&basis_kernel_path));
  CeedCallBackend(CeedFree(&basis_kernel_source));

  CeedCallBackend(CeedBasisSetData(basis, data));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create non-tensor H^1
//------------------------------------------------------------------------------
int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                           const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
  Ceed                     ceed;
  char                    *basis_kernel_source;
  const char              *basis_kernel_path;
  CeedInt                  num_comp, q_comp_interp, q_comp_grad;
  const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
  CeedBasisNonTensor_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &data));

  // Copy basis data to GPU
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
  if (q_weight) {
    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
  }
  if (interp) {
    const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
  }
  if (grad) {
    const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice));
  }

  // Compile basis kernels
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
  CeedCallBackend(CeedFree(&basis_kernel_path));
  CeedCallBackend(CeedFree(&basis_kernel_source));

  CeedCallBackend(CeedBasisSetData(basis, data));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create non-tensor H(div)
//------------------------------------------------------------------------------
int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
                             const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
  Ceed                     ceed;
  char                    *basis_kernel_source;
  const char              *basis_kernel_path;
  CeedInt                  num_comp, q_comp_interp, q_comp_div;
  const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
  CeedBasisNonTensor_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &data));

  // Copy basis data to GPU
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
  if (q_weight) {
    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
  }
  if (interp) {
    const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
  }
  if (div) {
    const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice));
  }

  // Compile basis kernels; div reuses the generic Deriv kernels via BASIS_Q_COMP_DERIV
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
  CeedCallBackend(CeedFree(&basis_kernel_path));
  CeedCallBackend(CeedFree(&basis_kernel_source));

  CeedCallBackend(CeedBasisSetData(basis, data));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create non-tensor H(curl)
//------------------------------------------------------------------------------
int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
                              const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
  Ceed                     ceed;
  char                    *basis_kernel_source;
  const char              *basis_kernel_path;
  CeedInt                  num_comp, q_comp_interp, q_comp_curl;
  const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
  CeedBasisNonTensor_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &data));

  // Copy basis data to GPU
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
  if (q_weight) {
    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
  }
  if (interp) {
    const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
  }
  if (curl) {
    const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice));
  }

  // Compile basis kernels; curl reuses the generic Deriv kernels via BASIS_Q_COMP_DERIV
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
  CeedCallBackend(CeedFree(&basis_kernel_path));
  CeedCallBackend(CeedFree(&basis_kernel_source));

  CeedCallBackend(CeedBasisSetData(basis, data));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------