1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <string.h> 14 15 #include "../cuda/ceed-cuda-common.h" 16 #include "../cuda/ceed-cuda-compile.h" 17 #include "ceed-cuda-ref.h" 18 19 //------------------------------------------------------------------------------ 20 // Basis apply - tensor 21 //------------------------------------------------------------------------------ 22 static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 23 CeedVector u, CeedVector v) { 24 Ceed ceed; 25 CeedInt Q_1d, dim; 26 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 27 const int max_block_size = 32; 28 const CeedScalar *d_u; 29 CeedScalar *d_v; 30 CeedBasis_Cuda *data; 31 32 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 33 CeedCallBackend(CeedBasisGetData(basis, &data)); 34 35 // Get read/write access to u, v 36 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 37 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 38 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 39 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 40 41 // Clear v for transpose operation 42 if (is_transpose && !apply_add) { 43 CeedInt num_comp, q_comp, num_nodes, num_qpts; 44 CeedSize length; 45 46 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 47 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 48 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 49 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 50 length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 51 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 52 } 53 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 54 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 55 56 // Basis action 57 switch (eval_mode) { 58 case CEED_EVAL_INTERP: { 59 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 60 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 61 62 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args)); 63 } break; 64 case CEED_EVAL_GRAD: { 65 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 66 const CeedInt block_size = max_block_size; 67 68 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); 69 } break; 70 case CEED_EVAL_WEIGHT: { 71 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 72 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 73 const int block_size_x = Q_1d; 74 const int block_size_y = dim >= 2 ? Q_1d : 1; 75 76 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 77 } break; 78 case CEED_EVAL_NONE: /* handled separately below */ 79 break; 80 // LCOV_EXCL_START 81 case CEED_EVAL_DIV: 82 case CEED_EVAL_CURL: 83 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 84 // LCOV_EXCL_STOP 85 } 86 87 // Restore vectors, cover CEED_EVAL_NONE 88 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 89 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 90 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 91 CeedCallBackend(CeedDestroy(&ceed)); 92 return CEED_ERROR_SUCCESS; 93 } 94 95 static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 96 CeedVector v) { 97 CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 98 return CEED_ERROR_SUCCESS; 99 } 100 101 static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 102 CeedVector v) { 103 CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 104 return CEED_ERROR_SUCCESS; 105 } 106 107 //------------------------------------------------------------------------------ 108 // Basis apply - tensor AtPoints 109 //------------------------------------------------------------------------------ 110 static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 111 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 112 Ceed ceed; 113 CeedInt Q_1d, dim, max_num_points = num_points[0]; 114 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 115 const int max_block_size = 32; 116 const CeedScalar *d_x, *d_u; 117 CeedScalar *d_v; 118 CeedBasis_Cuda *data; 119 120 CeedCallBackend(CeedBasisGetData(basis, &data)); 121 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 122 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 123 124 // Weight handled separately 125 if (eval_mode == CEED_EVAL_WEIGHT) { 126 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 127 return CEED_ERROR_SUCCESS; 128 } 129 130 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 131 132 // Check padded to uniform number of points per elem 133 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 134 { 135 CeedInt num_comp, q_comp; 136 CeedSize len, len_required; 137 138 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 139 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 140 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 141 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 142 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 143 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 144 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 145 len, len_required); 146 } 147 148 // Move num_points array to device 149 if (is_transpose) { 150 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 151 152 if (num_elem != data->num_elem_at_points) { 153 data->num_elem_at_points = num_elem; 154 155 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 156 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 157 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 158 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 159 } 160 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 161 memcpy(data->h_points_per_elem, num_points, num_bytes); 162 CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 163 } 164 } 165 166 // Build kernels if needed 167 if (data->num_points != max_num_points) { 168 CeedInt P_1d; 169 170 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 171 data->num_points = max_num_points; 172 173 // -- Create interp matrix to Chebyshev coefficients 174 if (!data->d_chebyshev_interp_1d) { 175 CeedSize interp_bytes; 176 CeedScalar *chebyshev_interp_1d; 177 178 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 179 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 180 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 181 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 182 CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 183 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 184 } 185 186 // -- Compile kernels 187 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n"; 188 CeedInt num_comp; 189 190 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 191 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 192 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 193 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 194 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 195 max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 196 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 197 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 198 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 199 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 200 } 201 202 // Get read/write access to u, v 203 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 204 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 205 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 206 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 207 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 208 209 // Clear v for transpose operation 210 if (is_transpose && !apply_add) { 211 CeedInt num_comp, q_comp, num_nodes; 212 CeedSize length; 213 214 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 215 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 216 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 217 length = 218 (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp)); 219 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 220 } 221 222 // Basis action 223 switch (eval_mode) { 224 case CEED_EVAL_INTERP: { 225 void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 226 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 227 228 CeedCallBackend( 229 CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); 230 } break; 231 case CEED_EVAL_GRAD: { 232 void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 233 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 234 235 CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); 236 } break; 237 case CEED_EVAL_WEIGHT: 238 case CEED_EVAL_NONE: /* handled separately below */ 239 break; 240 // LCOV_EXCL_START 241 case CEED_EVAL_DIV: 242 case CEED_EVAL_CURL: 243 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 244 // LCOV_EXCL_STOP 245 } 246 247 // Restore vectors, cover CEED_EVAL_NONE 248 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 249 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 250 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 251 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 252 CeedCallBackend(CeedDestroy(&ceed)); 253 return CEED_ERROR_SUCCESS; 254 } 255 256 static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 257 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 258 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 259 return CEED_ERROR_SUCCESS; 260 } 261 262 static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 263 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 264 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 265 return CEED_ERROR_SUCCESS; 266 } 267 268 //------------------------------------------------------------------------------ 269 // Basis apply - non-tensor 270 //------------------------------------------------------------------------------ 271 static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 272 CeedVector u, CeedVector v) { 273 Ceed ceed; 274 CeedInt num_nodes, num_qpts; 275 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 276 const int elems_per_block = 1; 277 const int grid = CeedDivUpInt(num_elem, elems_per_block); 278 const CeedScalar *d_u; 279 CeedScalar *d_v; 280 CeedBasisNonTensor_Cuda *data; 281 282 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 283 CeedCallBackend(CeedBasisGetData(basis, &data)); 284 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 285 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 286 287 // Get read/write access to u, v 288 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 289 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 290 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 291 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 292 293 // Clear v for transpose operation 294 if (is_transpose && !apply_add) { 295 CeedInt num_comp, q_comp; 296 CeedSize length; 297 298 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 299 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 300 length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 301 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 302 } 303 304 // Apply basis operation 305 switch (eval_mode) { 306 case CEED_EVAL_INTERP: { 307 void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 308 const int block_size_x = is_transpose ? num_nodes : num_qpts; 309 310 if (is_transpose) { 311 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 312 } else { 313 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 314 } 315 } break; 316 case CEED_EVAL_GRAD: { 317 void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 318 const int block_size_x = is_transpose ? num_nodes : num_qpts; 319 320 if (is_transpose) { 321 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 322 } else { 323 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 324 } 325 } break; 326 case CEED_EVAL_DIV: { 327 void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 328 const int block_size_x = is_transpose ? num_nodes : num_qpts; 329 330 if (is_transpose) { 331 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 332 } else { 333 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 334 } 335 } break; 336 case CEED_EVAL_CURL: { 337 void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 338 const int block_size_x = is_transpose ? num_nodes : num_qpts; 339 340 if (is_transpose) { 341 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 342 } else { 343 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 344 } 345 } break; 346 case CEED_EVAL_WEIGHT: { 347 CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 348 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 349 350 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 351 } break; 352 case CEED_EVAL_NONE: /* handled separately below */ 353 break; 354 } 355 356 // Restore vectors, cover CEED_EVAL_NONE 357 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 358 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 359 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 360 CeedCallBackend(CeedDestroy(&ceed)); 361 return CEED_ERROR_SUCCESS; 362 } 363 364 static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 365 CeedVector v) { 366 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 367 return CEED_ERROR_SUCCESS; 368 } 369 370 static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 371 CeedVector v) { 372 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 373 return CEED_ERROR_SUCCESS; 374 } 375 376 //------------------------------------------------------------------------------ 377 // Destroy tensor basis 378 //------------------------------------------------------------------------------ 379 static int CeedBasisDestroy_Cuda(CeedBasis basis) { 380 Ceed ceed; 381 CeedBasis_Cuda *data; 382 383 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 384 CeedCallBackend(CeedBasisGetData(basis, &data)); 385 CeedCallCuda(ceed, cuModuleUnload(data->module)); 386 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 387 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 388 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 389 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 390 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 391 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 392 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 393 CeedCallBackend(CeedFree(&data)); 394 CeedCallBackend(CeedDestroy(&ceed)); 395 return CEED_ERROR_SUCCESS; 396 } 397 398 //------------------------------------------------------------------------------ 399 // Destroy non-tensor basis 400 //------------------------------------------------------------------------------ 401 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 402 Ceed ceed; 403 CeedBasisNonTensor_Cuda *data; 404 405 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 406 CeedCallBackend(CeedBasisGetData(basis, &data)); 407 CeedCallCuda(ceed, cuModuleUnload(data->module)); 408 if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 409 CeedCallCuda(ceed, cudaFree(data->d_interp)); 410 CeedCallCuda(ceed, cudaFree(data->d_grad)); 411 CeedCallCuda(ceed, cudaFree(data->d_div)); 412 CeedCallCuda(ceed, cudaFree(data->d_curl)); 413 CeedCallBackend(CeedFree(&data)); 414 CeedCallBackend(CeedDestroy(&ceed)); 415 return CEED_ERROR_SUCCESS; 416 } 417 418 //------------------------------------------------------------------------------ 419 // Create tensor 420 //------------------------------------------------------------------------------ 421 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 422 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 423 Ceed ceed; 424 CeedInt num_comp; 425 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 426 const CeedInt interp_bytes = q_bytes * P_1d; 427 CeedBasis_Cuda *data; 428 429 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 430 CeedCallBackend(CeedCalloc(1, &data)); 431 432 // Copy data to GPU 433 if (q_weight_1d) { 434 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 435 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 436 } 437 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 438 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 439 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 440 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 441 442 // Compile basis kernels 443 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor.h>\n"; 444 445 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 446 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 447 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 448 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 449 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 450 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 451 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 452 453 CeedCallBackend(CeedBasisSetData(basis, data)); 454 455 // Register backend functions 456 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 457 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); 458 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); 459 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); 460 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 461 CeedCallBackend(CeedDestroy(&ceed)); 462 return CEED_ERROR_SUCCESS; 463 } 464 465 //------------------------------------------------------------------------------ 466 // Create non-tensor H^1 467 //------------------------------------------------------------------------------ 468 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 469 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 470 Ceed ceed; 471 CeedInt num_comp, q_comp_interp, q_comp_grad; 472 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 473 CeedBasisNonTensor_Cuda *data; 474 475 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 476 CeedCallBackend(CeedCalloc(1, &data)); 477 478 // Copy basis data to GPU 479 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 480 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 481 if (q_weight) { 482 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 483 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 484 } 485 if (interp) { 486 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 487 488 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 489 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 490 } 491 if (grad) { 492 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 493 494 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 495 CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 496 } 497 498 // Compile basis kernels 499 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 500 501 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 502 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 503 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 504 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 505 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 506 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 507 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 508 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 509 510 CeedCallBackend(CeedBasisSetData(basis, data)); 511 512 // Register backend functions 513 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 514 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 515 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 516 CeedCallBackend(CeedDestroy(&ceed)); 517 return CEED_ERROR_SUCCESS; 518 } 519 520 //------------------------------------------------------------------------------ 521 // Create non-tensor H(div) 522 //------------------------------------------------------------------------------ 523 int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 524 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 525 Ceed ceed; 526 CeedInt num_comp, q_comp_interp, q_comp_div; 527 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 528 CeedBasisNonTensor_Cuda *data; 529 530 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 531 CeedCallBackend(CeedCalloc(1, &data)); 532 533 // Copy basis data to GPU 534 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 535 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 536 if (q_weight) { 537 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 538 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 539 } 540 if (interp) { 541 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 542 543 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 544 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 545 } 546 if (div) { 547 const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 548 549 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes)); 550 CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice)); 551 } 552 553 // Compile basis kernels 554 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 555 556 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 557 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 558 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 559 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 560 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 561 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 562 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 563 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 564 565 CeedCallBackend(CeedBasisSetData(basis, data)); 566 567 // Register backend functions 568 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 569 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 570 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 571 CeedCallBackend(CeedDestroy(&ceed)); 572 return CEED_ERROR_SUCCESS; 573 } 574 575 //------------------------------------------------------------------------------ 576 // Create non-tensor H(curl) 577 //------------------------------------------------------------------------------ 578 int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 579 const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 580 Ceed ceed; 581 CeedInt num_comp, q_comp_interp, q_comp_curl; 582 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 583 CeedBasisNonTensor_Cuda *data; 584 585 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 586 CeedCallBackend(CeedCalloc(1, &data)); 587 588 // Copy basis data to GPU 589 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 590 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 591 if (q_weight) { 592 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 593 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 594 } 595 if (interp) { 596 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 597 598 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 599 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 600 } 601 if (curl) { 602 const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 603 604 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes)); 605 CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice)); 606 } 607 608 // Compile basis kernels 609 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 610 611 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 612 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 613 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 614 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 615 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 616 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 617 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 618 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 619 620 CeedCallBackend(CeedBasisSetData(basis, data)); 621 622 // Register backend functions 623 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 624 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 625 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 626 CeedCallBackend(CeedDestroy(&ceed)); 627 return CEED_ERROR_SUCCESS; 628 } 629 630 //------------------------------------------------------------------------------ 631