1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <string.h> 14 15 #include "../cuda/ceed-cuda-common.h" 16 #include "../cuda/ceed-cuda-compile.h" 17 #include "ceed-cuda-ref.h" 18 19 //------------------------------------------------------------------------------ 20 // Basis apply - tensor 21 //------------------------------------------------------------------------------ 22 static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 23 CeedVector u, CeedVector v) { 24 Ceed ceed; 25 CeedInt Q_1d, dim; 26 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 27 const int max_block_size = 32; 28 const CeedScalar *d_u; 29 CeedScalar *d_v; 30 CeedBasis_Cuda *data; 31 32 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 33 CeedCallBackend(CeedBasisGetData(basis, &data)); 34 35 // Get read/write access to u, v 36 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 37 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 38 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 39 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 40 41 // Clear v for transpose operation 42 if (is_transpose && !apply_add) { 43 CeedInt num_comp, q_comp, num_nodes, num_qpts; 44 CeedSize length; 45 46 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 47 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 48 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 49 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 50 length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 51 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 52 } 53 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 54 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 55 56 // Basis action 57 switch (eval_mode) { 58 case CEED_EVAL_INTERP: { 59 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 60 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 61 62 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args)); 63 } break; 64 case CEED_EVAL_GRAD: { 65 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 66 const CeedInt block_size = max_block_size; 67 68 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); 69 } break; 70 case CEED_EVAL_WEIGHT: { 71 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 72 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 73 const int block_size_x = Q_1d; 74 const int block_size_y = dim >= 2 ? Q_1d : 1; 75 76 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 77 } break; 78 case CEED_EVAL_NONE: /* handled separately below */ 79 break; 80 // LCOV_EXCL_START 81 case CEED_EVAL_DIV: 82 case CEED_EVAL_CURL: 83 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 84 // LCOV_EXCL_STOP 85 } 86 87 // Restore vectors, cover CEED_EVAL_NONE 88 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 89 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 90 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 91 return CEED_ERROR_SUCCESS; 92 } 93 94 static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 95 CeedVector v) { 96 CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 97 return CEED_ERROR_SUCCESS; 98 } 99 100 static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 101 CeedVector v) { 102 CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 103 return CEED_ERROR_SUCCESS; 104 } 105 106 //------------------------------------------------------------------------------ 107 // Basis apply - tensor AtPoints 108 //------------------------------------------------------------------------------ 109 static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 110 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 111 Ceed ceed; 112 CeedInt Q_1d, dim, max_num_points = num_points[0]; 113 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 114 const int max_block_size = 32; 115 const CeedScalar *d_x, *d_u; 116 CeedScalar *d_v; 117 CeedBasis_Cuda *data; 118 119 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 120 CeedCallBackend(CeedBasisGetData(basis, &data)); 121 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 122 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 123 124 // Weight handled separately 125 if (eval_mode == CEED_EVAL_WEIGHT) { 126 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 127 return CEED_ERROR_SUCCESS; 128 } 129 130 // Check padded to uniform number of points per elem 131 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 132 { 133 CeedInt num_comp, q_comp; 134 CeedSize len, len_required; 135 136 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 137 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 138 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 139 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 140 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 141 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 142 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 143 len, len_required); 144 } 145 146 // Move num_points array to device 147 if (is_transpose) { 148 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 149 150 if (num_elem != data->num_elem_at_points) { 151 data->num_elem_at_points = num_elem; 152 153 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 154 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 155 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 156 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 157 } 158 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 159 memcpy(data->h_points_per_elem, num_points, num_bytes); 160 CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 161 } 162 } 163 164 // Build kernels if needed 165 if (data->num_points != max_num_points) { 166 CeedInt P_1d; 167 168 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 169 data->num_points = max_num_points; 170 171 // -- Create interp matrix to Chebyshev coefficients 172 if (!data->d_chebyshev_interp_1d) { 173 CeedSize interp_bytes; 174 CeedScalar *chebyshev_interp_1d; 175 176 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 177 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 178 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 179 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 180 CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 181 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 182 } 183 184 // -- Compile kernels 185 char *basis_kernel_source; 186 const char *basis_kernel_path; 187 CeedInt num_comp; 188 189 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 190 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 191 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path)); 192 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 193 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 194 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 195 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 196 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 197 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 198 max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 199 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 200 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 201 CeedCallBackend(CeedFree(&basis_kernel_path)); 202 CeedCallBackend(CeedFree(&basis_kernel_source)); 203 } 204 205 // Get read/write access to u, v 206 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 207 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 208 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 209 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 210 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 211 212 // Clear v for transpose operation 213 if (is_transpose && !apply_add) { 214 CeedInt num_comp, q_comp, num_nodes; 215 CeedSize length; 216 217 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 218 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 219 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 220 length = 221 (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp)); 222 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 223 } 224 225 // Basis action 226 switch (eval_mode) { 227 case CEED_EVAL_INTERP: { 228 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 229 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 230 231 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); 232 } break; 233 case CEED_EVAL_GRAD: { 234 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 235 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 236 237 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); 238 } break; 239 case CEED_EVAL_WEIGHT: 240 case CEED_EVAL_NONE: /* handled separately below */ 241 break; 242 // LCOV_EXCL_START 243 case CEED_EVAL_DIV: 244 case CEED_EVAL_CURL: 245 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 246 // LCOV_EXCL_STOP 247 } 248 249 // Restore vectors, cover CEED_EVAL_NONE 250 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 251 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 252 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 253 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 254 return CEED_ERROR_SUCCESS; 255 } 256 257 static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 258 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 259 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 260 return CEED_ERROR_SUCCESS; 261 } 262 263 static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 264 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 265 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 266 return CEED_ERROR_SUCCESS; 267 } 268 269 //------------------------------------------------------------------------------ 270 // Basis apply - non-tensor 271 //------------------------------------------------------------------------------ 272 static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 273 CeedVector u, CeedVector v) { 274 Ceed ceed; 275 CeedInt num_nodes, num_qpts; 276 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 277 const int elems_per_block = 1; 278 const int grid = CeedDivUpInt(num_elem, elems_per_block); 279 const CeedScalar *d_u; 280 CeedScalar *d_v; 281 CeedBasisNonTensor_Cuda *data; 282 283 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 284 CeedCallBackend(CeedBasisGetData(basis, &data)); 285 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 286 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 287 288 // Get read/write access to u, v 289 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 290 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 291 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 292 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 293 294 // Clear v for transpose operation 295 if (is_transpose && !apply_add) { 296 CeedInt num_comp, q_comp; 297 CeedSize length; 298 299 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 300 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 301 length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 302 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 303 } 304 305 // Apply basis operation 306 switch (eval_mode) { 307 case CEED_EVAL_INTERP: { 308 void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 309 const int block_size_x = is_transpose ? num_nodes : num_qpts; 310 311 if (is_transpose) { 312 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 313 } else { 314 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 315 } 316 } break; 317 case CEED_EVAL_GRAD: { 318 void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 319 const int block_size_x = is_transpose ? num_nodes : num_qpts; 320 321 if (is_transpose) { 322 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 323 } else { 324 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 325 } 326 } break; 327 case CEED_EVAL_DIV: { 328 void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 329 const int block_size_x = is_transpose ? num_nodes : num_qpts; 330 331 if (is_transpose) { 332 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 333 } else { 334 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 335 } 336 } break; 337 case CEED_EVAL_CURL: { 338 void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 339 const int block_size_x = is_transpose ? num_nodes : num_qpts; 340 341 if (is_transpose) { 342 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 343 } else { 344 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 345 } 346 } break; 347 case CEED_EVAL_WEIGHT: { 348 CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 349 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 350 351 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 352 } break; 353 case CEED_EVAL_NONE: /* handled separately below */ 354 break; 355 } 356 357 // Restore vectors, cover CEED_EVAL_NONE 358 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 359 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 360 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 361 return CEED_ERROR_SUCCESS; 362 } 363 364 static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 365 CeedVector v) { 366 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 367 return CEED_ERROR_SUCCESS; 368 } 369 370 static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 371 CeedVector v) { 372 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 373 return CEED_ERROR_SUCCESS; 374 } 375 376 //------------------------------------------------------------------------------ 377 // Destroy tensor basis 378 //------------------------------------------------------------------------------ 379 static int CeedBasisDestroy_Cuda(CeedBasis basis) { 380 Ceed ceed; 381 CeedBasis_Cuda *data; 382 383 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 384 CeedCallBackend(CeedBasisGetData(basis, &data)); 385 CeedCallCuda(ceed, cuModuleUnload(data->module)); 386 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 387 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 388 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 389 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 390 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 391 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 392 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 393 CeedCallBackend(CeedFree(&data)); 394 return CEED_ERROR_SUCCESS; 395 } 396 397 //------------------------------------------------------------------------------ 398 // Destroy non-tensor basis 399 //------------------------------------------------------------------------------ 400 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 401 Ceed ceed; 402 CeedBasisNonTensor_Cuda *data; 403 404 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 405 CeedCallBackend(CeedBasisGetData(basis, &data)); 406 CeedCallCuda(ceed, cuModuleUnload(data->module)); 407 if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 408 CeedCallCuda(ceed, cudaFree(data->d_interp)); 409 CeedCallCuda(ceed, cudaFree(data->d_grad)); 410 CeedCallCuda(ceed, cudaFree(data->d_div)); 411 CeedCallCuda(ceed, cudaFree(data->d_curl)); 412 CeedCallBackend(CeedFree(&data)); 413 return CEED_ERROR_SUCCESS; 414 } 415 416 //------------------------------------------------------------------------------ 417 // Create tensor 418 //------------------------------------------------------------------------------ 419 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 420 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 421 Ceed ceed; 422 char *basis_kernel_source; 423 const char *basis_kernel_path; 424 CeedInt num_comp; 425 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 426 const CeedInt interp_bytes = q_bytes * P_1d; 427 CeedBasis_Cuda *data; 428 429 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 430 CeedCallBackend(CeedCalloc(1, &data)); 431 432 // Copy data to GPU 433 if (q_weight_1d) { 434 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 435 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 436 } 437 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 438 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 439 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 440 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 441 442 // Compile basis kernels 443 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 444 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path)); 445 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 446 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 447 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 448 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 449 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 450 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 451 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 452 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 453 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 454 CeedCallBackend(CeedFree(&basis_kernel_path)); 455 CeedCallBackend(CeedFree(&basis_kernel_source)); 456 457 CeedCallBackend(CeedBasisSetData(basis, data)); 458 459 // Register backend functions 460 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 461 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); 462 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); 463 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); 464 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 465 return CEED_ERROR_SUCCESS; 466 } 467 468 //------------------------------------------------------------------------------ 469 // Create non-tensor H^1 470 //------------------------------------------------------------------------------ 471 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 472 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 473 Ceed ceed; 474 char *basis_kernel_source; 475 const char *basis_kernel_path; 476 CeedInt num_comp, q_comp_interp, q_comp_grad; 477 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 478 CeedBasisNonTensor_Cuda *data; 479 480 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 481 CeedCallBackend(CeedCalloc(1, &data)); 482 483 // Copy basis data to GPU 484 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 485 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 486 if (q_weight) { 487 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 488 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 489 } 490 if (interp) { 491 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 492 493 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 494 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 495 } 496 if (grad) { 497 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 498 499 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 500 CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 501 } 502 503 // Compile basis kernels 504 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 505 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 506 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 507 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 508 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 509 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 510 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 511 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 512 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 513 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 514 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 515 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 516 CeedCallBackend(CeedFree(&basis_kernel_path)); 517 CeedCallBackend(CeedFree(&basis_kernel_source)); 518 519 CeedCallBackend(CeedBasisSetData(basis, data)); 520 521 // Register backend functions 522 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 523 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 524 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 525 return CEED_ERROR_SUCCESS; 526 } 527 528 //------------------------------------------------------------------------------ 529 // Create non-tensor H(div) 530 //------------------------------------------------------------------------------ 531 int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 532 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 533 Ceed ceed; 534 char *basis_kernel_source; 535 const char *basis_kernel_path; 536 CeedInt num_comp, q_comp_interp, q_comp_div; 537 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 538 CeedBasisNonTensor_Cuda *data; 539 540 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 541 CeedCallBackend(CeedCalloc(1, &data)); 542 543 // Copy basis data to GPU 544 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 545 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 546 if (q_weight) { 547 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 548 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 549 } 550 if (interp) { 551 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 552 553 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 554 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 555 } 556 if (div) { 557 const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 558 559 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes)); 560 CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice)); 561 } 562 563 // Compile basis kernels 564 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 565 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 566 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 567 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 568 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 569 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 570 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 571 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 572 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 573 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 574 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 575 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 576 CeedCallBackend(CeedFree(&basis_kernel_path)); 577 CeedCallBackend(CeedFree(&basis_kernel_source)); 578 579 CeedCallBackend(CeedBasisSetData(basis, data)); 580 581 // Register backend functions 582 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 583 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 584 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 585 return CEED_ERROR_SUCCESS; 586 } 587 588 //------------------------------------------------------------------------------ 589 // Create non-tensor H(curl) 590 //------------------------------------------------------------------------------ 591 int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 592 const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 593 Ceed ceed; 594 char *basis_kernel_source; 595 const char *basis_kernel_path; 596 CeedInt num_comp, q_comp_interp, q_comp_curl; 597 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 598 CeedBasisNonTensor_Cuda *data; 599 600 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 601 CeedCallBackend(CeedCalloc(1, &data)); 602 603 // Copy basis data to GPU 604 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 605 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 606 if (q_weight) { 607 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 608 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 609 } 610 if (interp) { 611 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 612 613 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 614 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 615 } 616 if (curl) { 617 const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 618 619 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes)); 620 CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice)); 621 } 622 623 // Compile basis kernels 624 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 625 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 626 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 627 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 628 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 629 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 630 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 631 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 632 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 633 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 634 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 635 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 636 CeedCallBackend(CeedFree(&basis_kernel_path)); 637 CeedCallBackend(CeedFree(&basis_kernel_source)); 638 639 CeedCallBackend(CeedBasisSetData(basis, data)); 640 641 // Register backend functions 642 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 643 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 644 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 645 return CEED_ERROR_SUCCESS; 646 } 647 648 //------------------------------------------------------------------------------ 649