1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <stddef.h> 15 #include <string.h> 16 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #include "ceed-cuda-shared.h" 20 21 //------------------------------------------------------------------------------ 22 // Apply tensor basis 23 //------------------------------------------------------------------------------ 24 static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 25 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 26 Ceed ceed; 27 Ceed_Cuda *ceed_Cuda; 28 CeedInt dim, num_comp; 29 const CeedScalar *d_u; 30 CeedScalar *d_v; 31 CeedBasis_Cuda_shared *data; 32 33 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 34 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 35 CeedCallBackend(CeedBasisGetData(basis, &data)); 36 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 37 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 38 39 // Get read/write access to u, v 40 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 41 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 42 if (apply_add) { 43 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 44 } else { 45 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 46 } 47 48 // Apply basis operation 49 switch (eval_mode) { 50 case CEED_EVAL_INTERP: { 51 CeedInt P_1d, Q_1d; 52 53 CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]); 54 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 55 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 56 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 57 58 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 59 60 if (dim == 1) { 61 // avoid >512 total threads 62 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 63 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 64 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 65 66 if (t_mode == CEED_TRANSPOSE) { 67 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1, 68 elems_per_block, shared_mem, interp_args)); 69 } else { 70 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); 71 } 72 } else if (dim == 2) { 73 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 74 // elems_per_block must be at least 1 75 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 76 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 77 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 78 79 if (t_mode == CEED_TRANSPOSE) { 80 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 81 thread_1d, elems_per_block, shared_mem, interp_args)); 82 } else { 83 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, 84 interp_args)); 85 } 86 } else if (dim == 3) { 87 CeedInt elems_per_block = 1; 88 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 89 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 90 91 if (t_mode == CEED_TRANSPOSE) { 92 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 93 thread_1d, elems_per_block, shared_mem, interp_args)); 94 } else { 95 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, 96 interp_args)); 97 } 98 } 99 } break; 100 case CEED_EVAL_GRAD: { 101 CeedInt P_1d, Q_1d; 102 103 CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]); 104 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 105 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 106 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 107 CeedScalar *d_grad_1d = data->d_grad_1d; 108 109 if (data->d_collo_grad_1d) { 110 d_grad_1d = data->d_collo_grad_1d; 111 } 112 void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v}; 113 114 if (dim == 1) { 115 // avoid >512 total threads 116 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 117 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 118 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 119 120 if (t_mode == CEED_TRANSPOSE) { 121 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 1, 122 elems_per_block, shared_mem, grad_args)); 123 } else { 124 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); 125 } 126 } else if (dim == 2) { 127 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 128 // elems_per_block must be at least 1 129 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 130 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 131 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 132 133 if (t_mode == CEED_TRANSPOSE) { 134 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 135 thread_1d, elems_per_block, shared_mem, grad_args)); 136 } else { 137 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 138 } 139 } else if (dim == 3) { 140 CeedInt elems_per_block = 1; 141 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 142 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 143 144 if (t_mode == CEED_TRANSPOSE) { 145 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 146 thread_1d, elems_per_block, shared_mem, grad_args)); 147 } else { 148 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 149 } 150 } 151 } break; 152 case CEED_EVAL_WEIGHT: { 153 CeedInt Q_1d; 154 CeedInt block_size = 32; 155 156 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 157 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 158 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 159 if (dim == 1) { 160 const CeedInt elems_per_block = block_size / Q_1d; 161 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 162 163 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); 164 } else if (dim == 2) { 165 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 166 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 167 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 168 169 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 170 } else if (dim == 3) { 171 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 172 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 173 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 174 175 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 176 } 177 } break; 178 case CEED_EVAL_NONE: /* handled separately below */ 179 break; 180 // LCOV_EXCL_START 181 case CEED_EVAL_DIV: 182 case CEED_EVAL_CURL: 183 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 184 // LCOV_EXCL_STOP 185 } 186 187 // Restore vectors, cover CEED_EVAL_NONE 188 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 189 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 190 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 191 CeedCallBackend(CeedDestroy(&ceed)); 192 return CEED_ERROR_SUCCESS; 193 } 194 195 static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 196 CeedVector v) { 197 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 198 return CEED_ERROR_SUCCESS; 199 } 200 201 static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 202 CeedVector u, CeedVector v) { 203 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 204 return CEED_ERROR_SUCCESS; 205 } 206 207 //------------------------------------------------------------------------------ 208 // Basis apply - tensor AtPoints 209 //------------------------------------------------------------------------------ 210 static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 211 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 212 Ceed ceed; 213 Ceed_Cuda *ceed_Cuda; 214 CeedInt Q_1d, dim, num_comp, max_num_points = num_points[0]; 215 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 216 const CeedScalar *d_x, *d_u; 217 CeedScalar *d_v; 218 CeedBasis_Cuda_shared *data; 219 220 CeedCallBackend(CeedBasisGetData(basis, &data)); 221 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 222 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 223 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 224 225 // Weight handled separately 226 if (eval_mode == CEED_EVAL_WEIGHT) { 227 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 228 return CEED_ERROR_SUCCESS; 229 } 230 231 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 232 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 233 234 // Check padded to uniform number of points per elem 235 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 236 { 237 CeedInt q_comp; 238 CeedSize len, len_required; 239 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 240 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 241 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 242 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 243 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 244 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 245 len, len_required); 246 } 247 248 // Move num_points array to device 249 if (is_transpose) { 250 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 251 252 if (num_elem != data->num_elem_at_points) { 253 data->num_elem_at_points = num_elem; 254 255 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 256 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 257 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 258 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 259 } 260 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 261 memcpy(data->h_points_per_elem, num_points, num_bytes); 262 CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 263 } 264 } 265 266 // Build kernels if needed 267 if (data->num_points != max_num_points) { 268 CeedInt P_1d; 269 270 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 271 data->num_points = max_num_points; 272 273 // -- Create interp matrix to Chebyshev coefficients 274 if (!data->d_chebyshev_interp_1d) { 275 CeedSize interp_bytes; 276 CeedScalar *chebyshev_interp_1d; 277 278 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 279 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 280 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 281 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 282 CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 283 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 284 } 285 286 // -- Compile kernels 287 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h>\n"; 288 CeedInt num_comp; 289 290 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 291 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 292 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D", 293 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 294 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points)); 295 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 296 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 297 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAddAtPoints", &data->InterpTransposeAddAtPoints)); 298 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 299 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 300 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAddAtPoints", &data->GradTransposeAddAtPoints)); 301 } 302 303 // Get read/write access to u, v 304 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 305 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 306 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 307 if (apply_add) { 308 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 309 } else { 310 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 311 } 312 313 // Basis action 314 switch (eval_mode) { 315 case CEED_EVAL_INTERP: { 316 CeedInt P_1d, Q_1d; 317 318 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 319 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 320 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 321 322 void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 323 324 if (dim == 1) { 325 // avoid >512 total threads 326 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 327 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 328 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 329 330 if (is_transpose) { 331 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, 332 thread_1d, 1, elems_per_block, shared_mem, interp_args)); 333 } else { 334 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, 335 interp_args)); 336 } 337 } else if (dim == 2) { 338 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 339 // elems_per_block must be at least 1 340 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 341 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 342 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 343 344 if (is_transpose) { 345 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, 346 thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 347 } else { 348 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, 349 interp_args)); 350 } 351 } else if (dim == 3) { 352 CeedInt elems_per_block = 1; 353 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 354 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 355 356 if (is_transpose) { 357 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, 358 thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 359 } else { 360 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, 361 interp_args)); 362 } 363 } 364 } break; 365 case CEED_EVAL_GRAD: { 366 CeedInt P_1d, Q_1d; 367 368 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 369 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 370 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 371 372 void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 373 374 if (dim == 1) { 375 // avoid >512 total threads 376 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 377 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 378 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 379 380 if (is_transpose) { 381 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, 382 thread_1d, 1, elems_per_block, shared_mem, grad_args)); 383 } else { 384 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); 385 } 386 } else if (dim == 2) { 387 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 388 // elems_per_block must be at least 1 389 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 390 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 391 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 392 393 if (is_transpose) { 394 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, 395 thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 396 } else { 397 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, 398 grad_args)); 399 } 400 } else if (dim == 3) { 401 CeedInt elems_per_block = 1; 402 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 403 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 404 405 if (is_transpose) { 406 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, 407 thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 408 } else { 409 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, 410 grad_args)); 411 } 412 } 413 } break; 414 case CEED_EVAL_WEIGHT: 415 case CEED_EVAL_NONE: /* handled separately below */ 416 break; 417 // LCOV_EXCL_START 418 case CEED_EVAL_DIV: 419 case CEED_EVAL_CURL: 420 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 421 // LCOV_EXCL_STOP 422 } 423 424 // Restore vectors, cover CEED_EVAL_NONE 425 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 426 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 427 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 428 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 429 CeedCallBackend(CeedDestroy(&ceed)); 430 return CEED_ERROR_SUCCESS; 431 } 432 433 static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 434 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 435 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 436 return CEED_ERROR_SUCCESS; 437 } 438 439 static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 440 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 441 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 442 return CEED_ERROR_SUCCESS; 443 } 444 445 //------------------------------------------------------------------------------ 446 // Apply non-tensor basis 447 //------------------------------------------------------------------------------ 448 static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 449 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 450 Ceed ceed; 451 Ceed_Cuda *ceed_Cuda; 452 CeedInt dim; 453 const CeedScalar *d_u; 454 CeedScalar *d_v; 455 CeedBasis_Cuda_shared *data; 456 457 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 458 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 459 CeedCallBackend(CeedBasisGetData(basis, &data)); 460 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 461 462 // Get read/write access to u, v 463 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 464 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 465 if (apply_add) { 466 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 467 } else { 468 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 469 } 470 471 // Apply basis operation 472 switch (eval_mode) { 473 case CEED_EVAL_INTERP: { 474 CeedInt P, Q; 475 476 CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]); 477 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 478 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 479 CeedInt thread = CeedIntMax(Q, P); 480 481 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 482 483 { 484 // avoid >512 total threads 485 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 486 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 487 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 488 489 if (t_mode == CEED_TRANSPOSE) { 490 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1, 491 elems_per_block, shared_mem, interp_args)); 492 } else { 493 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, interp_args)); 494 } 495 } 496 } break; 497 case CEED_EVAL_GRAD: { 498 CeedInt P, Q; 499 500 CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]); 501 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 502 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 503 CeedInt thread = CeedIntMax(Q, P); 504 505 void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v}; 506 507 { 508 // avoid >512 total threads 509 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 510 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 511 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 512 513 if (t_mode == CEED_TRANSPOSE) { 514 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1, 515 elems_per_block, shared_mem, grad_args)); 516 } else { 517 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, grad_args)); 518 } 519 } 520 } break; 521 case CEED_EVAL_WEIGHT: { 522 CeedInt P, Q; 523 524 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 525 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 526 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 527 CeedInt thread = CeedIntMax(Q, P); 528 529 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 530 531 { 532 // avoid >512 total threads 533 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 534 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 535 536 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight_args)); 537 } 538 } break; 539 case CEED_EVAL_NONE: /* handled separately below */ 540 break; 541 // LCOV_EXCL_START 542 case CEED_EVAL_DIV: 543 case CEED_EVAL_CURL: 544 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 545 // LCOV_EXCL_STOP 546 } 547 548 // Restore vectors, cover CEED_EVAL_NONE 549 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 550 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 551 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 552 CeedCallBackend(CeedDestroy(&ceed)); 553 return CEED_ERROR_SUCCESS; 554 } 555 556 static int CeedBasisApplyNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 557 CeedVector u, CeedVector v) { 558 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 559 return CEED_ERROR_SUCCESS; 560 } 561 562 static int CeedBasisApplyAddNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 563 CeedVector u, CeedVector v) { 564 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 565 return CEED_ERROR_SUCCESS; 566 } 567 568 //------------------------------------------------------------------------------ 569 // Destroy basis 570 //------------------------------------------------------------------------------ 571 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 572 Ceed ceed; 573 CeedBasis_Cuda_shared *data; 574 575 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 576 CeedCallBackend(CeedBasisGetData(basis, &data)); 577 CeedCallCuda(ceed, cuModuleUnload(data->module)); 578 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 579 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 580 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 581 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 582 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 583 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 584 CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); 585 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 586 CeedCallBackend(CeedFree(&data)); 587 CeedCallBackend(CeedDestroy(&ceed)); 588 return CEED_ERROR_SUCCESS; 589 } 590 591 //------------------------------------------------------------------------------ 592 // Create tensor basis 593 //------------------------------------------------------------------------------ 594 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 595 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 596 Ceed ceed; 597 CeedInt num_comp; 598 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 599 const CeedInt interp_bytes = q_bytes * P_1d; 600 CeedBasis_Cuda_shared *data; 601 602 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 603 CeedCallBackend(CeedCalloc(1, &data)); 604 605 // Copy basis data to GPU 606 if (q_weight_1d) { 607 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 608 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 609 } 610 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 611 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 612 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 613 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 614 615 // Compute collocated gradient and copy to GPU 616 data->d_collo_grad_1d = NULL; 617 bool has_collocated_grad = dim == 3 && Q_1d >= P_1d; 618 619 if (has_collocated_grad) { 620 CeedScalar *collo_grad_1d; 621 622 CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); 623 CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); 624 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); 625 CeedCallCuda(ceed, cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, cudaMemcpyHostToDevice)); 626 CeedCallBackend(CeedFree(&collo_grad_1d)); 627 } 628 629 // Compile basis kernels 630 bool is_collocated = false; 631 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor.h>\n"; 632 633 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 634 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D", 635 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 636 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); 637 CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); 638 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocated" : "Interp", &data->Interp)); 639 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTranspose" : "InterpTranspose", &data->InterpTranspose)); 640 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd", 641 &data->InterpTransposeAdd)); 642 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocated" : "Grad", &data->Grad)); 643 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTranspose" : "GradTranspose", &data->GradTranspose)); 644 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTransposeAdd" : "GradTransposeAdd", &data->GradTransposeAdd)); 645 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 646 647 CeedCallBackend(CeedBasisSetData(basis, data)); 648 649 // Register backend functions 650 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); 651 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared)); 652 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared)); 653 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda_shared)); 654 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 655 CeedCallBackend(CeedDestroy(&ceed)); 656 return CEED_ERROR_SUCCESS; 657 } 658 659 //------------------------------------------------------------------------------ 660 // Create non-tensor basis 661 //------------------------------------------------------------------------------ 662 int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 663 const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 664 Ceed ceed; 665 CeedInt num_comp, q_comp_interp, q_comp_grad; 666 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 667 CeedBasis_Cuda_shared *data; 668 669 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 670 671 // Check shared memory size 672 { 673 Ceed_Cuda *cuda_data; 674 675 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 676 if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) > 677 cuda_data->device_prop.sharedMemPerBlock) { 678 CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis)); 679 CeedCallBackend(CeedDestroy(&ceed)); 680 return CEED_ERROR_SUCCESS; 681 } 682 } 683 684 CeedCallBackend(CeedCalloc(1, &data)); 685 686 // Copy basis data to GPU 687 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 688 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 689 if (q_weight) { 690 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 691 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight, q_bytes, cudaMemcpyHostToDevice)); 692 } 693 if (interp) { 694 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 695 696 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 697 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp, interp_bytes, cudaMemcpyHostToDevice)); 698 } 699 if (grad) { 700 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 701 702 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, grad_bytes)); 703 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad, grad_bytes, cudaMemcpyHostToDevice)); 704 } 705 706 // Compile basis kernels 707 const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor.h>\n"; 708 709 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 710 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_T_1D", 711 CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp)); 712 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 713 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 714 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 715 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 716 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 717 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 718 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 719 720 CeedCallBackend(CeedBasisSetData(basis, data)); 721 722 // Register backend functions 723 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda_shared)); 724 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda_shared)); 725 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 726 CeedCallBackend(CeedDestroy(&ceed)); 727 return CEED_ERROR_SUCCESS; 728 } 729 730 //------------------------------------------------------------------------------ 731