1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <string.h> 14 15 #include "../cuda/ceed-cuda-common.h" 16 #include "../cuda/ceed-cuda-compile.h" 17 #include "ceed-cuda-ref.h" 18 19 //------------------------------------------------------------------------------ 20 // Basis apply - tensor 21 //------------------------------------------------------------------------------ 22 static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 23 CeedVector u, CeedVector v) { 24 Ceed ceed; 25 CeedInt Q_1d, dim; 26 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 27 const int max_block_size = 32; 28 const CeedScalar *d_u; 29 CeedScalar *d_v; 30 CeedBasis_Cuda *data; 31 32 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 33 CeedCallBackend(CeedBasisGetData(basis, &data)); 34 35 // Get read/write access to u, v 36 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 37 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 38 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 39 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 40 41 // Clear v for transpose operation 42 if (is_transpose && !apply_add) { 43 CeedInt num_comp, q_comp, num_nodes, num_qpts; 44 CeedSize length; 45 46 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 47 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 48 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 49 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 50 length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 51 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 52 } 53 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 54 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 55 56 // Basis action 57 switch (eval_mode) { 58 case CEED_EVAL_INTERP: { 59 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 60 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 61 62 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args)); 63 } break; 64 case CEED_EVAL_GRAD: { 65 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 66 const CeedInt block_size = max_block_size; 67 68 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); 69 } break; 70 case CEED_EVAL_WEIGHT: { 71 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 72 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 73 const int block_size_x = Q_1d; 74 const int block_size_y = dim >= 2 ? Q_1d : 1; 75 76 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 77 } break; 78 case CEED_EVAL_NONE: /* handled separately below */ 79 break; 80 // LCOV_EXCL_START 81 case CEED_EVAL_DIV: 82 case CEED_EVAL_CURL: 83 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 84 // LCOV_EXCL_STOP 85 } 86 87 // Restore vectors, cover CEED_EVAL_NONE 88 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 89 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 90 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 91 CeedCallBackend(CeedDestroy(&ceed)); 92 return CEED_ERROR_SUCCESS; 93 } 94 95 static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 96 CeedVector v) { 97 CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 98 return CEED_ERROR_SUCCESS; 99 } 100 101 static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 102 CeedVector v) { 103 CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 104 return CEED_ERROR_SUCCESS; 105 } 106 107 //------------------------------------------------------------------------------ 108 // Basis apply - tensor AtPoints 109 //------------------------------------------------------------------------------ 110 static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 111 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 112 Ceed ceed; 113 CeedInt Q_1d, dim, max_num_points = num_points[0]; 114 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 115 const int max_block_size = 32; 116 const CeedScalar *d_x, *d_u; 117 CeedScalar *d_v; 118 CeedBasis_Cuda *data; 119 120 CeedCallBackend(CeedBasisGetData(basis, &data)); 121 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 122 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 123 124 // Weight handled separately 125 if (eval_mode == CEED_EVAL_WEIGHT) { 126 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 127 return CEED_ERROR_SUCCESS; 128 } 129 130 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 131 132 // Check padded to uniform number of points per elem 133 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 134 { 135 CeedInt num_comp, q_comp; 136 CeedSize len, len_required; 137 138 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 139 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 140 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 141 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 142 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 143 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 144 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 145 len, len_required); 146 } 147 148 // Move num_points array to device 149 if (is_transpose) { 150 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 151 152 if (num_elem != data->num_elem_at_points) { 153 data->num_elem_at_points = num_elem; 154 155 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 156 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 157 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 158 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 159 } 160 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 161 memcpy(data->h_points_per_elem, num_points, num_bytes); 162 CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 163 } 164 } 165 166 // Build kernels if needed 167 if (data->num_points != max_num_points) { 168 CeedInt P_1d; 169 170 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 171 data->num_points = max_num_points; 172 173 // -- Create interp matrix to Chebyshev coefficients 174 if (!data->d_chebyshev_interp_1d) { 175 CeedSize interp_bytes; 176 CeedScalar *chebyshev_interp_1d; 177 178 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 179 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 180 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 181 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 182 CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 183 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 184 } 185 186 // -- Compile kernels 187 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n"; 188 CeedInt num_comp; 189 190 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 191 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 192 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 193 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 194 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 195 max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 196 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 197 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 198 } 199 200 // Get read/write access to u, v 201 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 202 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 203 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 204 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 205 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 206 207 // Clear v for transpose operation 208 if (is_transpose && !apply_add) { 209 CeedInt num_comp, q_comp, num_nodes; 210 CeedSize length; 211 212 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 213 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 214 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 215 length = 216 (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp)); 217 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 218 } 219 220 // Basis action 221 switch (eval_mode) { 222 case CEED_EVAL_INTERP: { 223 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 224 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 225 226 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); 227 } break; 228 case CEED_EVAL_GRAD: { 229 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 230 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 231 232 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); 233 } break; 234 case CEED_EVAL_WEIGHT: 235 case CEED_EVAL_NONE: /* handled separately below */ 236 break; 237 // LCOV_EXCL_START 238 case CEED_EVAL_DIV: 239 case CEED_EVAL_CURL: 240 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 241 // LCOV_EXCL_STOP 242 } 243 244 // Restore vectors, cover CEED_EVAL_NONE 245 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 246 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 247 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 248 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 249 CeedCallBackend(CeedDestroy(&ceed)); 250 return CEED_ERROR_SUCCESS; 251 } 252 253 static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 254 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 255 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 256 return CEED_ERROR_SUCCESS; 257 } 258 259 static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 260 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 261 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 262 return CEED_ERROR_SUCCESS; 263 } 264 265 //------------------------------------------------------------------------------ 266 // Basis apply - non-tensor 267 //------------------------------------------------------------------------------ 268 static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 269 CeedVector u, CeedVector v) { 270 Ceed ceed; 271 CeedInt num_nodes, num_qpts; 272 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 273 const int elems_per_block = 1; 274 const int grid = CeedDivUpInt(num_elem, elems_per_block); 275 const CeedScalar *d_u; 276 CeedScalar *d_v; 277 CeedBasisNonTensor_Cuda *data; 278 279 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 280 CeedCallBackend(CeedBasisGetData(basis, &data)); 281 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 282 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 283 284 // Get read/write access to u, v 285 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 286 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 287 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 288 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 289 290 // Clear v for transpose operation 291 if (is_transpose && !apply_add) { 292 CeedInt num_comp, q_comp; 293 CeedSize length; 294 295 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 296 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 297 length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 298 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 299 } 300 301 // Apply basis operation 302 switch (eval_mode) { 303 case CEED_EVAL_INTERP: { 304 void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 305 const int block_size_x = is_transpose ? num_nodes : num_qpts; 306 307 if (is_transpose) { 308 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 309 } else { 310 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 311 } 312 } break; 313 case CEED_EVAL_GRAD: { 314 void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 315 const int block_size_x = is_transpose ? num_nodes : num_qpts; 316 317 if (is_transpose) { 318 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 319 } else { 320 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 321 } 322 } break; 323 case CEED_EVAL_DIV: { 324 void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 325 const int block_size_x = is_transpose ? num_nodes : num_qpts; 326 327 if (is_transpose) { 328 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 329 } else { 330 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 331 } 332 } break; 333 case CEED_EVAL_CURL: { 334 void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 335 const int block_size_x = is_transpose ? num_nodes : num_qpts; 336 337 if (is_transpose) { 338 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 339 } else { 340 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 341 } 342 } break; 343 case CEED_EVAL_WEIGHT: { 344 CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 345 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 346 347 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 348 } break; 349 case CEED_EVAL_NONE: /* handled separately below */ 350 break; 351 } 352 353 // Restore vectors, cover CEED_EVAL_NONE 354 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 355 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 356 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 357 CeedCallBackend(CeedDestroy(&ceed)); 358 return CEED_ERROR_SUCCESS; 359 } 360 361 static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 362 CeedVector v) { 363 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 364 return CEED_ERROR_SUCCESS; 365 } 366 367 static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 368 CeedVector v) { 369 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 370 return CEED_ERROR_SUCCESS; 371 } 372 373 //------------------------------------------------------------------------------ 374 // Destroy tensor basis 375 //------------------------------------------------------------------------------ 376 static int CeedBasisDestroy_Cuda(CeedBasis basis) { 377 Ceed ceed; 378 CeedBasis_Cuda *data; 379 380 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 381 CeedCallBackend(CeedBasisGetData(basis, &data)); 382 CeedCallCuda(ceed, cuModuleUnload(data->module)); 383 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 384 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 385 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 386 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 387 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 388 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 389 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 390 CeedCallBackend(CeedFree(&data)); 391 CeedCallBackend(CeedDestroy(&ceed)); 392 return CEED_ERROR_SUCCESS; 393 } 394 395 //------------------------------------------------------------------------------ 396 // Destroy non-tensor basis 397 //------------------------------------------------------------------------------ 398 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 399 Ceed ceed; 400 CeedBasisNonTensor_Cuda *data; 401 402 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 403 CeedCallBackend(CeedBasisGetData(basis, &data)); 404 CeedCallCuda(ceed, cuModuleUnload(data->module)); 405 if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 406 CeedCallCuda(ceed, cudaFree(data->d_interp)); 407 CeedCallCuda(ceed, cudaFree(data->d_grad)); 408 CeedCallCuda(ceed, cudaFree(data->d_div)); 409 CeedCallCuda(ceed, cudaFree(data->d_curl)); 410 CeedCallBackend(CeedFree(&data)); 411 CeedCallBackend(CeedDestroy(&ceed)); 412 return CEED_ERROR_SUCCESS; 413 } 414 415 //------------------------------------------------------------------------------ 416 // Create tensor 417 //------------------------------------------------------------------------------ 418 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 419 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 420 Ceed ceed; 421 CeedInt num_comp; 422 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 423 const CeedInt interp_bytes = q_bytes * P_1d; 424 CeedBasis_Cuda *data; 425 426 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 427 CeedCallBackend(CeedCalloc(1, &data)); 428 429 // Copy data to GPU 430 if (q_weight_1d) { 431 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 432 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 433 } 434 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 435 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 436 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 437 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 438 439 // Compile basis kernels 440 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor.h>\n"; 441 442 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 443 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 444 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 445 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 446 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 447 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 448 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 449 450 CeedCallBackend(CeedBasisSetData(basis, data)); 451 452 // Register backend functions 453 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 454 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); 455 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); 456 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); 457 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 458 CeedCallBackend(CeedDestroy(&ceed)); 459 return CEED_ERROR_SUCCESS; 460 } 461 462 //------------------------------------------------------------------------------ 463 // Create non-tensor H^1 464 //------------------------------------------------------------------------------ 465 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 466 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 467 Ceed ceed; 468 CeedInt num_comp, q_comp_interp, q_comp_grad; 469 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 470 CeedBasisNonTensor_Cuda *data; 471 472 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 473 CeedCallBackend(CeedCalloc(1, &data)); 474 475 // Copy basis data to GPU 476 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 477 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 478 if (q_weight) { 479 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 480 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 481 } 482 if (interp) { 483 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 484 485 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 486 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 487 } 488 if (grad) { 489 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 490 491 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 492 CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 493 } 494 495 // Compile basis kernels 496 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 497 498 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 499 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 500 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 501 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 502 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 503 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 504 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 505 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 506 507 CeedCallBackend(CeedBasisSetData(basis, data)); 508 509 // Register backend functions 510 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 511 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 512 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 513 CeedCallBackend(CeedDestroy(&ceed)); 514 return CEED_ERROR_SUCCESS; 515 } 516 517 //------------------------------------------------------------------------------ 518 // Create non-tensor H(div) 519 //------------------------------------------------------------------------------ 520 int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 521 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 522 Ceed ceed; 523 CeedInt num_comp, q_comp_interp, q_comp_div; 524 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 525 CeedBasisNonTensor_Cuda *data; 526 527 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 528 CeedCallBackend(CeedCalloc(1, &data)); 529 530 // Copy basis data to GPU 531 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 532 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 533 if (q_weight) { 534 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 535 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 536 } 537 if (interp) { 538 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 539 540 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 541 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 542 } 543 if (div) { 544 const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 545 546 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes)); 547 CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice)); 548 } 549 550 // Compile basis kernels 551 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 552 553 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 554 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 555 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 556 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 557 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 558 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 559 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 560 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 561 562 CeedCallBackend(CeedBasisSetData(basis, data)); 563 564 // Register backend functions 565 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 566 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 567 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 568 CeedCallBackend(CeedDestroy(&ceed)); 569 return CEED_ERROR_SUCCESS; 570 } 571 572 //------------------------------------------------------------------------------ 573 // Create non-tensor H(curl) 574 //------------------------------------------------------------------------------ 575 int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 576 const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 577 Ceed ceed; 578 CeedInt num_comp, q_comp_interp, q_comp_curl; 579 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 580 CeedBasisNonTensor_Cuda *data; 581 582 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 583 CeedCallBackend(CeedCalloc(1, &data)); 584 585 // Copy basis data to GPU 586 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 587 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 588 if (q_weight) { 589 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 590 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 591 } 592 if (interp) { 593 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 594 595 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 596 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 597 } 598 if (curl) { 599 const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 600 601 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes)); 602 CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice)); 603 } 604 605 // Compile basis kernels 606 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 607 608 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 609 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 610 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 611 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 612 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 613 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 614 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 615 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 616 617 CeedCallBackend(CeedBasisSetData(basis, data)); 618 619 // Register backend functions 620 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 621 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 622 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 623 CeedCallBackend(CeedDestroy(&ceed)); 624 return CEED_ERROR_SUCCESS; 625 } 626 627 //------------------------------------------------------------------------------ 628