// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <ceed/backend.h>
#include <ceed/jit-tools.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include "../cuda/ceed-cuda-common.h"
#include "../cuda/ceed-cuda-compile.h"
#include "ceed-cuda-ref.h"

//------------------------------------------------------------------------------
// Basis apply - tensor
//
// Shared core for Apply/ApplyAdd on tensor-product bases.
// apply_add == false: v is taken write-only and (for transpose) zeroed first.
// apply_add == true:  v is taken read/write and the kernel output accumulates.
// u may be CEED_VECTOR_NONE only for CEED_EVAL_WEIGHT (no input needed).
//------------------------------------------------------------------------------
static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
                                   CeedVector u, CeedVector v) {
  Ceed              ceed;
  CeedInt           Q_1d, dim;
  const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
  const int         max_block_size = 32;  // cap on threads per block for the tensor kernels
  const CeedScalar *d_u;
  CeedScalar       *d_v;
  CeedBasis_Cuda   *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedBasisGetData(basis, &data));

  // Get read/write access to u, v
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Clear v for transpose operation
  // (transpose application sums contributions into v, so it must start from
  // zero unless the caller asked to accumulate; byte-wise cudaMemset is valid
  // here because the fill value is 0)
  if (is_transpose && !apply_add) {
    CeedSize length;

    CeedCallBackend(CeedVectorGetLength(v, &length));
    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
  }
  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
  CeedCallBackend(CeedBasisGetDimension(basis, &dim));

  // Basis action
  switch (eval_mode) {
    case CEED_EVAL_INTERP: {
      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v};
      // Block size is the number of quadrature points, capped at max_block_size
      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args));
    } break;
    case CEED_EVAL_GRAD: {
      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
      const CeedInt block_size  = max_block_size;

      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args));
    } break;
    case CEED_EVAL_WEIGHT: {
      // Quadrature weights are optional at basis creation; fail clearly if unset
      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
      void     *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
      const int block_size_x  = Q_1d;
      const int block_size_y  = dim >= 2 ? Q_1d : 1;  // 2D thread block only for dim >= 2

      CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args));
    } break;
    case CEED_EVAL_NONE: /* handled separately below */
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
    // LCOV_EXCL_STOP
  }

  // Restore vectors, cover CEED_EVAL_NONE
  // (for CEED_EVAL_NONE no kernel runs; v is filled with a device-side copy of u)
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  return CEED_ERROR_SUCCESS;
}

// Apply tensor basis, overwriting v
static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
                               CeedVector v) {
  CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v));
  return CEED_ERROR_SUCCESS;
}

// Apply tensor basis, accumulating into v
static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
                                  CeedVector v) {
  CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Basis apply - tensor AtPoints
//
// Shared core for Apply/ApplyAdd at arbitrary reference-space points x_ref.
// The AtPoints kernels are JIT-compiled lazily and specialized on the number
// of points per element, so they are (re)built whenever that count changes.
// Only a uniform number of points across all elements is supported.
//------------------------------------------------------------------------------
static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
                                           CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
  Ceed              ceed;
  CeedInt           Q_1d, dim, max_num_points = num_points[0];
  const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
  const int         max_block_size = 32;
  const CeedScalar *d_x, *d_u;
  CeedScalar       *d_v;
  CeedBasis_Cuda   *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedBasisGetData(basis, &data));
  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
  CeedCallBackend(CeedBasisGetDimension(basis, &dim));

  // Check uniform number of points per elem
  for (CeedInt i = 1; i < num_elem; i++) {
    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
              "BasisApplyAtPoints only supported for the same number of points in each element");
  }

  // Weight handled separately
  // (at arbitrary points every weight is 1.0, so no kernel launch is needed)
  if (eval_mode == CEED_EVAL_WEIGHT) {
    CeedCall(CeedVectorSetValue(v, 1.0));
    return CEED_ERROR_SUCCESS;
  }

  // Build kernels if needed
  if (data->num_points != max_num_points) {
    CeedInt P_1d;

    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    data->num_points = max_num_points;

    // -- Create interp matrix to Chebyshev coefficients
    // (uploaded once and reused across recompilations; it does not depend on
    // the number of points)
    if (!data->d_chebyshev_interp_1d) {
      CeedSize    interp_bytes;
      CeedScalar *chebyshev_interp_1d;

      interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
      CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
      CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
      CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
      CeedCallBackend(CeedFree(&chebyshev_interp_1d));
    }

    // -- Compile kernels
    char       *basis_kernel_source;
    const char *basis_kernel_path;
    CeedInt     num_comp;

    // Drop the previous specialization before rebuilding for the new point count
    if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path));
    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
    CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
    CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
                                     Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
    CeedCallBackend(CeedFree(&basis_kernel_path));
    CeedCallBackend(CeedFree(&basis_kernel_source));
  }

  // Get read/write access to u, v
  CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Clear v for transpose operation
  // (same rationale as the tensor core above: transpose accumulates into v)
  if (is_transpose && !apply_add) {
    CeedSize length;

    CeedCallBackend(CeedVectorGetLength(v, &length));
    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
  }

  // Basis action
  switch (eval_mode) {
    case CEED_EVAL_INTERP: {
      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
    } break;
    case CEED_EVAL_GRAD: {
      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
    } break;
    case CEED_EVAL_WEIGHT:  // unreachable: weight returned early above
    case CEED_EVAL_NONE:    /* handled separately below */
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
    // LCOV_EXCL_STOP
  }

  // Restore vectors, cover CEED_EVAL_NONE
  CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  return CEED_ERROR_SUCCESS;
}

// Apply tensor basis at points, overwriting v
static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
                                       CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
  return CEED_ERROR_SUCCESS;
}

// Apply tensor basis at points, accumulating into v
static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
                                          CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Basis apply - non-tensor
//
// Shared core for Apply/ApplyAdd on non-tensor bases (H^1, H(div), H(curl)).
// GRAD, DIV, and CURL all dispatch to the shared Deriv/DerivTranspose kernels,
// differing only in which operator matrix (d_grad/d_div/d_curl) is passed.
//------------------------------------------------------------------------------
static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
                                            CeedVector u, CeedVector v) {
  Ceed                     ceed;
  CeedInt                  num_nodes, num_qpts;
  const CeedInt            is_transpose    = t_mode == CEED_TRANSPOSE;
  const int                elems_per_block = 1;  // one element per thread block (z dimension)
  const int                grid            = CeedDivUpInt(num_elem, elems_per_block);
  const CeedScalar        *d_u;
  CeedScalar              *d_v;
  CeedBasisNonTensor_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedBasisGetData(basis, &data));
  CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
  CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));

  // Get read/write access to u, v
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Clear v for transpose operation
  if (is_transpose && !apply_add) {
    CeedSize length;

    CeedCallBackend(CeedVectorGetLength(v, &length));
    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
  }

  // Apply basis operation
  switch (eval_mode) {
    case CEED_EVAL_INTERP: {
      void     *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v};
      // One thread per output entry: nodes for transpose, quadrature points otherwise
      const int block_size_x  = is_transpose ? num_nodes : num_qpts;

      if (is_transpose) {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args));
      } else {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args));
      }
    } break;
    case CEED_EVAL_GRAD: {
      void     *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v};
      const int block_size_x = is_transpose ? num_nodes : num_qpts;

      if (is_transpose) {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args));
      } else {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args));
      }
    } break;
    case CEED_EVAL_DIV: {
      void     *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v};
      const int block_size_x = is_transpose ? num_nodes : num_qpts;

      if (is_transpose) {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args));
      } else {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args));
      }
    } break;
    case CEED_EVAL_CURL: {
      void     *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v};
      const int block_size_x = is_transpose ? num_nodes : num_qpts;

      if (is_transpose) {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args));
      } else {
        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args));
      }
    } break;
    case CEED_EVAL_WEIGHT: {
      // Quadrature weights are optional at basis creation; fail clearly if unset
      CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
      void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v};

      CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args));
    } break;
    case CEED_EVAL_NONE: /* handled separately below */
      break;
  }

  // Restore vectors, cover CEED_EVAL_NONE
  // (for CEED_EVAL_NONE no kernel runs; v is filled with a device-side copy of u)
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  return CEED_ERROR_SUCCESS;
}

// Apply non-tensor basis, overwriting v
static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
                                        CeedVector v) {
  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v));
  return CEED_ERROR_SUCCESS;
}

// Apply non-tensor basis, accumulating into v
static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
                                           CeedVector v) {
  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Destroy tensor basis
//
// Unloads the JIT modules and frees all device buffers. Optional buffers
// (d_q_weight_1d is allocated only when weights were provided; the AtPoints
// module and d_chebyshev_interp_1d only after an AtPoints apply) are guarded
// or rely on cudaFree(NULL) being a documented no-op.
//------------------------------------------------------------------------------
static int CeedBasisDestroy_Cuda(CeedBasis basis) {
  Ceed            ceed;
  CeedBasis_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedBasisGetData(basis, &data));
  CeedCallCuda(ceed, cuModuleUnload(data->module));
  if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
  if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
  CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
  CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
  CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
  CeedCallBackend(CeedFree(&data));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Destroy non-tensor basis
//
// Unloads the JIT module and frees all device buffers; only the buffers the
// creation routine actually allocated are non-NULL (cudaFree(NULL) is a
// documented no-op, so the unconditional frees are safe).
//------------------------------------------------------------------------------
static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
  Ceed                     ceed;
  CeedBasisNonTensor_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedBasisGetData(basis, &data));
  CeedCallCuda(ceed, cuModuleUnload(data->module));
  if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight));
  CeedCallCuda(ceed, cudaFree(data->d_interp));
  CeedCallCuda(ceed, cudaFree(data->d_grad));
  CeedCallCuda(ceed, cudaFree(data->d_div));
  CeedCallCuda(ceed, cudaFree(data->d_curl));
  CeedCallBackend(CeedFree(&data));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create tensor
//
// Uploads the 1D interp/grad matrices (P_1d x Q_1d each) and optional 1D
// quadrature weights to the device, JIT-compiles the tensor basis kernels
// specialized on (P_1d, Q_1d, dim, num_comp), and registers the backend
// Apply/ApplyAdd/ApplyAtPoints/Destroy functions.
//------------------------------------------------------------------------------
int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
  Ceed            ceed;
  char           *basis_kernel_source;
  const char     *basis_kernel_path;
  CeedInt         num_comp;
  const CeedInt   q_bytes      = Q_1d * sizeof(CeedScalar);
  const CeedInt   interp_bytes = q_bytes * P_1d;
  CeedBasis_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &data));

  // Copy data to GPU
  // (q_weight_1d is optional; CEED_EVAL_WEIGHT errors at apply time if unset)
  if (q_weight_1d) {
    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
  }
  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes));
  CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice));
  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes));
  CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice));

  // Compile basis kernels
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
  // BASIS_BUF_LEN sizes the scratch buffer off the larger of P_1d/Q_1d
  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
                                   Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                   "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
  CeedCallBackend(CeedFree(&basis_kernel_path));
  CeedCallBackend(CeedFree(&basis_kernel_source));

  CeedCallBackend(CeedBasisSetData(basis, data));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create non-tensor H^1
//
// Uploads the interp/grad operator matrices and optional quadrature weights,
// JIT-compiles the shared non-tensor kernels (grad is passed as the generic
// "Deriv" operator), and registers the backend Apply/ApplyAdd/Destroy.
//------------------------------------------------------------------------------
int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                           const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
  Ceed                     ceed;
  char                    *basis_kernel_source;
  const char              *basis_kernel_path;
  CeedInt                  num_comp, q_comp_interp, q_comp_grad;
  const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
  CeedBasisNonTensor_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &data));

  // Copy basis data to GPU
  // (interp/grad/q_weight are each optional; unallocated buffers stay NULL
  // from CeedCalloc and are handled at destroy time)
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
  if (q_weight) {
    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
  }
  if (interp) {
    const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
  }
  if (grad) {
    const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice));
  }

  // Compile basis kernels
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
  CeedCallBackend(CeedFree(&basis_kernel_path));
  CeedCallBackend(CeedFree(&basis_kernel_source));

  CeedCallBackend(CeedBasisSetData(basis, data));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create non-tensor H(div)
//
// Same structure as CeedBasisCreateH1_Cuda, but the divergence matrix is the
// generic "Deriv" operator (BASIS_Q_COMP_DERIV = q_comp_div).
//------------------------------------------------------------------------------
int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
                             const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
  Ceed                     ceed;
  char                    *basis_kernel_source;
  const char              *basis_kernel_path;
  CeedInt                  num_comp, q_comp_interp, q_comp_div;
  const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
  CeedBasisNonTensor_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &data));

  // Copy basis data to GPU
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
  if (q_weight) {
    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
  }
  if (interp) {
    const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
  }
  if (div) {
    const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice));
  }

  // Compile basis kernels
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
  CeedCallBackend(CeedFree(&basis_kernel_path));
  CeedCallBackend(CeedFree(&basis_kernel_source));

  CeedCallBackend(CeedBasisSetData(basis, data));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create non-tensor H(curl)
//
// Same structure as CeedBasisCreateH1_Cuda, but the curl matrix is the
// generic "Deriv" operator (BASIS_Q_COMP_DERIV = q_comp_curl).
//------------------------------------------------------------------------------
int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
                              const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
  Ceed                     ceed;
  char                    *basis_kernel_source;
  const char              *basis_kernel_path;
  CeedInt                  num_comp, q_comp_interp, q_comp_curl;
  const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
  CeedBasisNonTensor_Cuda *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &data));

  // Copy basis data to GPU
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
  if (q_weight) {
    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
  }
  if (interp) {
    const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
  }
  if (curl) {
    const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl;

    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes));
    CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice));
  }

  // Compile basis kernels
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
  CeedCallBackend(CeedFree(&basis_kernel_path));
  CeedCallBackend(CeedFree(&basis_kernel_source));

  CeedCallBackend(CeedBasisSetData(basis, data));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------