1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <string.h> 14 15 #include "../cuda/ceed-cuda-common.h" 16 #include "../cuda/ceed-cuda-compile.h" 17 #include "ceed-cuda-ref.h" 18 19 //------------------------------------------------------------------------------ 20 // Basis apply - tensor 21 //------------------------------------------------------------------------------ 22 static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 23 CeedVector u, CeedVector v) { 24 Ceed ceed; 25 CeedInt Q_1d, dim; 26 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 27 const int max_block_size = 32; 28 const CeedScalar *d_u; 29 CeedScalar *d_v; 30 CeedBasis_Cuda *data; 31 32 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 33 CeedCallBackend(CeedBasisGetData(basis, &data)); 34 35 // Get read/write access to u, v 36 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 37 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 38 if (apply_add) { 39 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 40 } else { 41 // Clear v for transpose operation 42 if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); 43 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 44 } 45 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 46 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 47 48 // Basis action 49 switch (eval_mode) { 50 case CEED_EVAL_INTERP: { 51 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 52 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 53 54 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args)); 55 } break; 56 case CEED_EVAL_GRAD: { 57 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 58 const CeedInt block_size = max_block_size; 59 60 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); 61 } break; 62 case CEED_EVAL_WEIGHT: { 63 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 64 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 65 const int block_size_x = Q_1d; 66 const int block_size_y = dim >= 2 ? Q_1d : 1; 67 68 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 69 } break; 70 case CEED_EVAL_NONE: /* handled separately below */ 71 break; 72 // LCOV_EXCL_START 73 case CEED_EVAL_DIV: 74 case CEED_EVAL_CURL: 75 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 76 // LCOV_EXCL_STOP 77 } 78 79 // Restore vectors, cover CEED_EVAL_NONE 80 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 81 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 82 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 83 CeedCallBackend(CeedDestroy(&ceed)); 84 return CEED_ERROR_SUCCESS; 85 } 86 87 static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 88 CeedVector v) { 89 CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 90 return CEED_ERROR_SUCCESS; 91 } 92 93 static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 94 CeedVector v) { 95 CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 96 return CEED_ERROR_SUCCESS; 97 } 98 99 //------------------------------------------------------------------------------ 100 // Basis apply - tensor AtPoints 101 //------------------------------------------------------------------------------ 102 static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 103 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 104 Ceed ceed; 105 CeedInt Q_1d, dim, max_num_points = num_points[0]; 106 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 107 const int max_block_size = 32; 108 const CeedScalar *d_x, *d_u; 109 CeedScalar *d_v; 110 CeedBasis_Cuda *data; 111 112 CeedCallBackend(CeedBasisGetData(basis, &data)); 113 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 114 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 115 116 // Weight handled separately 117 if (eval_mode == CEED_EVAL_WEIGHT) { 118 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 119 return CEED_ERROR_SUCCESS; 120 } 121 122 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 123 124 // Check padded to uniform number of points per elem 125 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 126 { 127 CeedInt num_comp, q_comp; 128 CeedSize len, len_required; 129 130 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 131 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 132 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 133 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 134 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 135 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 136 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 137 len, len_required); 138 } 139 140 // Move num_points array to device 141 if (is_transpose) { 142 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 143 144 if (num_elem != data->num_elem_at_points) { 145 data->num_elem_at_points = num_elem; 146 147 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 148 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 149 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 150 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 151 } 152 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 153 memcpy(data->h_points_per_elem, num_points, num_bytes); 154 CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 155 } 156 } 157 158 // Build kernels if needed 159 if (data->num_points != max_num_points) { 160 CeedInt P_1d; 161 162 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 163 data->num_points = max_num_points; 164 165 // -- Create interp matrix to Chebyshev coefficients 166 if (!data->d_chebyshev_interp_1d) { 167 CeedSize interp_bytes; 168 CeedScalar *chebyshev_interp_1d; 169 170 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 171 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 172 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 173 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 174 CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 175 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 176 } 177 178 // -- Compile kernels 179 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n"; 180 CeedInt num_comp; 181 182 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 183 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 184 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 185 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 186 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 187 max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 188 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 189 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 190 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 191 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 192 } 193 194 // Get read/write access to u, v 195 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 196 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 197 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 198 if (apply_add) { 199 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 200 } else { 201 // Clear v for transpose operation 202 if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); 203 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 204 } 205 206 // Basis action 207 switch (eval_mode) { 208 case CEED_EVAL_INTERP: { 209 void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 210 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 211 212 CeedCallBackend( 213 CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); 214 } break; 215 case CEED_EVAL_GRAD: { 216 void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 217 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 218 219 CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); 220 } break; 221 case CEED_EVAL_WEIGHT: 222 case CEED_EVAL_NONE: /* handled separately below */ 223 break; 224 // LCOV_EXCL_START 225 case CEED_EVAL_DIV: 226 case CEED_EVAL_CURL: 227 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 228 // LCOV_EXCL_STOP 229 } 230 231 // Restore vectors, cover CEED_EVAL_NONE 232 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 233 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 234 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 235 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 236 CeedCallBackend(CeedDestroy(&ceed)); 237 return CEED_ERROR_SUCCESS; 238 } 239 240 static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 241 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 242 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 243 return CEED_ERROR_SUCCESS; 244 } 245 246 static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 247 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 248 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 249 return CEED_ERROR_SUCCESS; 250 } 251 252 //------------------------------------------------------------------------------ 253 // Basis apply - non-tensor 254 //------------------------------------------------------------------------------ 255 static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 256 CeedVector u, CeedVector v) { 257 Ceed ceed; 258 CeedInt num_nodes, num_qpts; 259 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 260 const int elems_per_block = 1; 261 const int grid = CeedDivUpInt(num_elem, elems_per_block); 262 const CeedScalar *d_u; 263 CeedScalar *d_v; 264 CeedBasisNonTensor_Cuda *data; 265 266 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 267 CeedCallBackend(CeedBasisGetData(basis, &data)); 268 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 269 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 270 271 // Get read/write access to u, v 272 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 273 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 274 if (apply_add) { 275 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 276 } else { 277 // Clear v for transpose operation 278 if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); 279 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 280 } 281 282 // Apply basis operation 283 switch (eval_mode) { 284 case CEED_EVAL_INTERP: { 285 void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 286 const int block_size_x = is_transpose ? num_nodes : num_qpts; 287 288 if (is_transpose) { 289 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 290 } else { 291 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 292 } 293 } break; 294 case CEED_EVAL_GRAD: { 295 void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 296 const int block_size_x = is_transpose ? num_nodes : num_qpts; 297 298 if (is_transpose) { 299 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 300 } else { 301 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 302 } 303 } break; 304 case CEED_EVAL_DIV: { 305 void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 306 const int block_size_x = is_transpose ? num_nodes : num_qpts; 307 308 if (is_transpose) { 309 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 310 } else { 311 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 312 } 313 } break; 314 case CEED_EVAL_CURL: { 315 void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 316 const int block_size_x = is_transpose ? num_nodes : num_qpts; 317 318 if (is_transpose) { 319 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 320 } else { 321 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 322 } 323 } break; 324 case CEED_EVAL_WEIGHT: { 325 CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 326 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 327 328 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 329 } break; 330 case CEED_EVAL_NONE: /* handled separately below */ 331 break; 332 } 333 334 // Restore vectors, cover CEED_EVAL_NONE 335 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 336 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 337 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 338 CeedCallBackend(CeedDestroy(&ceed)); 339 return CEED_ERROR_SUCCESS; 340 } 341 342 static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 343 CeedVector v) { 344 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 345 return CEED_ERROR_SUCCESS; 346 } 347 348 static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 349 CeedVector v) { 350 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 351 return CEED_ERROR_SUCCESS; 352 } 353 354 //------------------------------------------------------------------------------ 355 // Destroy tensor basis 356 //------------------------------------------------------------------------------ 357 static int CeedBasisDestroy_Cuda(CeedBasis basis) { 358 Ceed ceed; 359 CeedBasis_Cuda *data; 360 361 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 362 CeedCallBackend(CeedBasisGetData(basis, &data)); 363 CeedCallCuda(ceed, cuModuleUnload(data->module)); 364 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 365 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 366 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 367 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 368 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 369 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 370 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 371 CeedCallBackend(CeedFree(&data)); 372 CeedCallBackend(CeedDestroy(&ceed)); 373 return CEED_ERROR_SUCCESS; 374 } 375 376 //------------------------------------------------------------------------------ 377 // Destroy non-tensor basis 378 //------------------------------------------------------------------------------ 379 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 380 Ceed ceed; 381 CeedBasisNonTensor_Cuda *data; 382 383 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 384 CeedCallBackend(CeedBasisGetData(basis, &data)); 385 CeedCallCuda(ceed, cuModuleUnload(data->module)); 386 if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 387 CeedCallCuda(ceed, cudaFree(data->d_interp)); 388 CeedCallCuda(ceed, cudaFree(data->d_grad)); 389 CeedCallCuda(ceed, cudaFree(data->d_div)); 390 CeedCallCuda(ceed, cudaFree(data->d_curl)); 391 CeedCallBackend(CeedFree(&data)); 392 CeedCallBackend(CeedDestroy(&ceed)); 393 return CEED_ERROR_SUCCESS; 394 } 395 396 //------------------------------------------------------------------------------ 397 // Create tensor 398 //------------------------------------------------------------------------------ 399 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 400 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 401 Ceed ceed; 402 CeedInt num_comp; 403 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 404 const CeedInt interp_bytes = q_bytes * P_1d; 405 CeedBasis_Cuda *data; 406 407 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 408 CeedCallBackend(CeedCalloc(1, &data)); 409 410 // Copy data to GPU 411 if (q_weight_1d) { 412 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 413 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 414 } 415 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 416 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 417 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 418 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 419 420 // Compile basis kernels 421 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor.h>\n"; 422 423 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 424 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 425 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 426 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 427 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 428 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 429 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 430 431 CeedCallBackend(CeedBasisSetData(basis, data)); 432 433 // Register backend functions 434 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 435 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); 436 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); 437 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); 438 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 439 CeedCallBackend(CeedDestroy(&ceed)); 440 return CEED_ERROR_SUCCESS; 441 } 442 443 //------------------------------------------------------------------------------ 444 // Create non-tensor H^1 445 //------------------------------------------------------------------------------ 446 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 447 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 448 Ceed ceed; 449 CeedInt num_comp, q_comp_interp, q_comp_grad; 450 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 451 CeedBasisNonTensor_Cuda *data; 452 453 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 454 CeedCallBackend(CeedCalloc(1, &data)); 455 456 // Copy basis data to GPU 457 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 458 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 459 if (q_weight) { 460 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 461 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 462 } 463 if (interp) { 464 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 465 466 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 467 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 468 } 469 if (grad) { 470 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 471 472 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 473 CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 474 } 475 476 // Compile basis kernels 477 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 478 479 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 480 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 481 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 482 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 483 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 484 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 485 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 486 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 487 488 CeedCallBackend(CeedBasisSetData(basis, data)); 489 490 // Register backend functions 491 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 492 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 493 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 494 CeedCallBackend(CeedDestroy(&ceed)); 495 return CEED_ERROR_SUCCESS; 496 } 497 498 //------------------------------------------------------------------------------ 499 // Create non-tensor H(div) 500 //------------------------------------------------------------------------------ 501 int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 502 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 503 Ceed ceed; 504 CeedInt num_comp, q_comp_interp, q_comp_div; 505 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 506 CeedBasisNonTensor_Cuda *data; 507 508 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 509 CeedCallBackend(CeedCalloc(1, &data)); 510 511 // Copy basis data to GPU 512 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 513 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 514 if (q_weight) { 515 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 516 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 517 } 518 if (interp) { 519 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 520 521 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 522 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 523 } 524 if (div) { 525 const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 526 527 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes)); 528 CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice)); 529 } 530 531 // Compile basis kernels 532 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 533 534 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 535 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 536 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 537 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 538 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 539 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 540 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 541 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 542 543 CeedCallBackend(CeedBasisSetData(basis, data)); 544 545 // Register backend functions 546 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 547 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 548 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 549 CeedCallBackend(CeedDestroy(&ceed)); 550 return CEED_ERROR_SUCCESS; 551 } 552 553 //------------------------------------------------------------------------------ 554 // Create non-tensor H(curl) 555 //------------------------------------------------------------------------------ 556 int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 557 const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 558 Ceed ceed; 559 CeedInt num_comp, q_comp_interp, q_comp_curl; 560 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 561 CeedBasisNonTensor_Cuda *data; 562 563 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 564 CeedCallBackend(CeedCalloc(1, &data)); 565 566 // Copy basis data to GPU 567 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 568 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 569 if (q_weight) { 570 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 571 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 572 } 573 if (interp) { 574 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 575 576 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 577 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 578 } 579 if (curl) { 580 const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 581 582 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes)); 583 CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice)); 584 } 585 586 // Compile basis kernels 587 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 588 589 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 590 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 591 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 592 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 593 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 594 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 595 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 596 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 597 598 CeedCallBackend(CeedBasisSetData(basis, data)); 599 600 // Register backend functions 601 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 602 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 603 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 604 CeedCallBackend(CeedDestroy(&ceed)); 605 return CEED_ERROR_SUCCESS; 606 } 607 608 //------------------------------------------------------------------------------ 609