1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-source/cuda/cuda-types.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stddef.h> 14 #include <string.h> 15 16 #include "../cuda/ceed-cuda-common.h" 17 #include "../cuda/ceed-cuda-compile.h" 18 #include "ceed-cuda-gen-operator-build.h" 19 #include "ceed-cuda-gen.h" 20 21 //------------------------------------------------------------------------------ 22 // Destroy operator 23 //------------------------------------------------------------------------------ 24 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 25 Ceed ceed; 26 CeedOperator_Cuda_gen *impl; 27 28 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 29 CeedCallBackend(CeedOperatorGetData(op, &impl)); 30 if (impl->module) CeedCallCuda(ceed, cuModuleUnload(impl->module)); 31 if (impl->module_assemble_full) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_full)); 32 if (impl->module_assemble_diagonal) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_diagonal)); 33 if (impl->module_assemble_qfunction) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_qfunction)); 34 if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem)); 35 CeedCallBackend(CeedFree(&impl)); 36 CeedCallBackend(CeedDestroy(&ceed)); 37 return CEED_ERROR_SUCCESS; 38 } 39 40 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, int elems_per_block) { 41 int useful_threads_per_block = threads_per_elem * elems_per_block; 42 // round up to nearest multiple of warp_size 43 int block_size = CeedDivUpInt(useful_threads_per_block, warp_size) * warp_size; 44 int blocks_per_sm = threads_per_sm / block_size; 45 return threads_per_sm - useful_threads_per_block * blocks_per_sm; 46 } 47 48 // Choose the least wasteful block size constrained by blocks_per_sm of max_threads_per_block. 49 // 50 // The x and y part of block[] contains per-element sizes (specified on input) while the z part is number of elements. 51 // 52 // Problem setting: we'd like to make occupancy high with relatively few inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how 53 // many threads can run. 54 // 55 // Note that full occupancy sometimes can't be achieved by one thread block. 56 // For example, an SM might support 1536 threads in total, but only 1024 within a single thread block. 57 // So cuOccupancyMaxPotentialBlockSize may suggest a block size of 768 so that two blocks can run, versus one block of 1024 will prevent a second 58 // block from running. The cuda-gen kernels are pretty heavy with lots of instruction-level parallelism (ILP) so we'll generally be okay with 59 // relatively low occupancy and smaller thread blocks, but we solve a reasonably general problem here. Empirically, we find that blocks bigger than 60 // about 256 have higher latency and worse load balancing when the number of elements is modest. 61 // 62 // cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of the number of quadrature points (or number of basis functions). 63 // They also have a lot of __syncthreads(), which is another point against excessively large thread blocks. 64 // Suppose I have elements with 7x7x7 quadrature points. 65 // This will loop over the last dimension, so we have 7*7=49 threads per element. 66 // Suppose we have two elements = 2*49=98 useful threads. 67 // CUDA schedules in units of full warps (32 threads), so 128 CUDA hardware threads are effectively committed to that block. 68 // Now suppose cuOccupancyMaxPotentialBlockSize returned 352. 69 // We can schedule 2 blocks of size 98 (196 useful threads using 256 hardware threads), but not a third block (which would need a total of 384 70 // hardware threads). 71 // 72 // If instead, we had packed 3 elements, we'd have 3*49=147 useful threads occupying 160 slots, and could schedule two blocks. 73 // Alternatively, we could pack a single block of 7 elements (2*49=343 useful threads) into the 354 slots. 74 // The latter has the least "waste", but __syncthreads() over-synchronizes and it might not pay off relative to smaller blocks. 75 static int BlockGridCalculate(CeedInt num_elem, int blocks_per_sm, int max_threads_per_block, int max_threads_z, int warp_size, int block[3], 76 int *grid) { 77 const int threads_per_sm = blocks_per_sm * max_threads_per_block; 78 const int threads_per_elem = block[0] * block[1]; 79 int elems_per_block = 1; 80 int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1); 81 82 for (int i = 2; i <= CeedIntMin(max_threads_per_block / threads_per_elem, num_elem); i++) { 83 int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i); 84 85 // We want to minimize waste, but smaller kernels have lower latency and less __syncthreads() overhead so when a larger block size has the same 86 // waste as a smaller one, go ahead and prefer the smaller block. 87 if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) { 88 elems_per_block = i; 89 waste = i_waste; 90 } 91 } 92 // In low-order elements, threads_per_elem may be sufficiently low to give an elems_per_block greater than allowable for the device, so we must 93 // check before setting the z-dimension size of the block. 94 block[2] = CeedIntMin(elems_per_block, max_threads_z); 95 *grid = CeedDivUpInt(num_elem, elems_per_block); 96 return CEED_ERROR_SUCCESS; 97 } 98 99 // callback for cuOccupancyMaxPotentialBlockSize, providing the amount of dynamic shared memory required for a thread block of size threads. 100 static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); } 101 102 //------------------------------------------------------------------------------ 103 // Apply and add to output 104 //------------------------------------------------------------------------------ 105 static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good, 106 CeedRequest *request) { 107 bool is_at_points, is_tensor; 108 Ceed ceed; 109 Ceed_Cuda *cuda_data; 110 CeedInt num_elem, num_input_fields, num_output_fields; 111 CeedEvalMode eval_mode; 112 CeedQFunctionField *qf_input_fields, *qf_output_fields; 113 CeedQFunction_Cuda_gen *qf_data; 114 CeedQFunction qf; 115 CeedOperatorField *op_input_fields, *op_output_fields; 116 CeedOperator_Cuda_gen *data; 117 118 // Build the operator kernel 119 CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, is_run_good)); 120 if (!(*is_run_good)) return CEED_ERROR_SUCCESS; 121 122 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 123 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 124 CeedCallBackend(CeedOperatorGetData(op, &data)); 125 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 126 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 127 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 128 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 129 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 130 131 // Input vectors 132 for (CeedInt i = 0; i < num_input_fields; i++) { 133 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 134 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 135 data->fields.inputs[i] = NULL; 136 } else { 137 bool is_active; 138 CeedVector vec; 139 140 // Get input vector 141 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 142 is_active = vec == CEED_VECTOR_ACTIVE; 143 if (is_active) data->fields.inputs[i] = input_arr; 144 else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); 145 CeedCallBackend(CeedVectorDestroy(&vec)); 146 } 147 } 148 149 // Output vectors 150 for (CeedInt i = 0; i < num_output_fields; i++) { 151 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 152 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 153 data->fields.outputs[i] = NULL; 154 } else { 155 bool is_active; 156 CeedVector vec; 157 158 // Get output vector 159 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 160 is_active = vec == CEED_VECTOR_ACTIVE; 161 if (is_active) data->fields.outputs[i] = output_arr; 162 else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i])); 163 CeedCallBackend(CeedVectorDestroy(&vec)); 164 } 165 } 166 167 // Point coordinates, if needed 168 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 169 if (is_at_points) { 170 // Coords 171 CeedVector vec; 172 173 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 174 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); 175 CeedCallBackend(CeedVectorDestroy(&vec)); 176 177 // Points per elem 178 if (num_elem != data->points.num_elem) { 179 CeedInt *points_per_elem; 180 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 181 CeedElemRestriction rstr_points = NULL; 182 183 data->points.num_elem = num_elem; 184 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 185 CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); 186 for (CeedInt e = 0; e < num_elem; e++) { 187 CeedInt num_points_elem; 188 189 CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); 190 points_per_elem[e] = num_points_elem; 191 } 192 if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); 193 CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); 194 CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); 195 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 196 CeedCallBackend(CeedFree(&points_per_elem)); 197 } 198 } 199 200 // Get context data 201 CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); 202 203 // Apply operator 204 void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points}; 205 int max_threads_per_block, min_grid_size, grid; 206 207 CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor)); 208 CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 209 int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1}; 210 211 if (is_tensor) { 212 CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block, 213 cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); 214 } else { 215 CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1)); 216 217 grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 218 block[2] = elems_per_block; 219 } 220 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 221 222 CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, stream, grid, block[0], block[1], block[2], shared_mem, is_run_good, opargs)); 223 224 // Restore input arrays 225 for (CeedInt i = 0; i < num_input_fields; i++) { 226 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 227 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 228 } else { 229 bool is_active; 230 CeedVector vec; 231 232 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 233 is_active = vec == CEED_VECTOR_ACTIVE; 234 if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); 235 CeedCallBackend(CeedVectorDestroy(&vec)); 236 } 237 } 238 239 // Restore output arrays 240 for (CeedInt i = 0; i < num_output_fields; i++) { 241 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 242 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 243 } else { 244 bool is_active; 245 CeedVector vec; 246 247 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 248 is_active = vec == CEED_VECTOR_ACTIVE; 249 if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i])); 250 CeedCallBackend(CeedVectorDestroy(&vec)); 251 } 252 } 253 254 // Restore point coordinates, if needed 255 if (is_at_points) { 256 CeedVector vec; 257 258 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 259 CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); 260 CeedCallBackend(CeedVectorDestroy(&vec)); 261 } 262 263 // Restore context data 264 CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); 265 266 // Cleanup 267 CeedCallBackend(CeedDestroy(&ceed)); 268 CeedCallBackend(CeedQFunctionDestroy(&qf)); 269 if (!(*is_run_good)) data->use_fallback = true; 270 return CEED_ERROR_SUCCESS; 271 } 272 273 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { 274 bool is_run_good = false; 275 const CeedScalar *input_arr = NULL; 276 CeedScalar *output_arr = NULL; 277 278 // Try to run kernel 279 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr)); 280 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr)); 281 CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, NULL, input_arr, output_arr, &is_run_good, request)); 282 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr)); 283 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr)); 284 285 // Fallback on unsuccessful run 286 if (!is_run_good) { 287 CeedOperator op_fallback; 288 289 CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for ApplyAdd\n"); 290 CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); 291 CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); 292 } 293 return CEED_ERROR_SUCCESS; 294 } 295 296 static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { 297 bool is_run_good[CEED_COMPOSITE_MAX] = {false}; 298 CeedInt num_suboperators; 299 const CeedScalar *input_arr = NULL; 300 CeedScalar *output_arr = NULL; 301 Ceed ceed; 302 CeedOperator *sub_operators; 303 304 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 305 CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); 306 CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); 307 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr)); 308 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr)); 309 for (CeedInt i = 0; i < num_suboperators; i++) { 310 CeedInt num_elem = 0; 311 312 CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem)); 313 if (num_elem > 0) { 314 cudaStream_t stream = NULL; 315 316 CeedCallCuda(ceed, cudaStreamCreate(&stream)); 317 CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request)); 318 CeedCallCuda(ceed, cudaStreamDestroy(stream)); 319 } 320 } 321 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr)); 322 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr)); 323 CeedCallCuda(ceed, cudaDeviceSynchronize()); 324 325 // Fallback on unsuccessful run 326 for (CeedInt i = 0; i < num_suboperators; i++) { 327 if (!is_run_good[i]) { 328 CeedOperator op_fallback; 329 330 CeedDebug(ceed, "\nFalling back to /gpu/cuda/ref CeedOperator for ApplyAdd\n"); 331 CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback)); 332 CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); 333 } 334 } 335 CeedCallBackend(CeedDestroy(&ceed)); 336 return CEED_ERROR_SUCCESS; 337 } 338 339 //------------------------------------------------------------------------------ 340 // QFunction assembly 341 //------------------------------------------------------------------------------ 342 static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, 343 CeedRequest *request) { 344 Ceed ceed; 345 CeedOperator_Cuda_gen *data; 346 347 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 348 CeedCallBackend(CeedOperatorGetData(op, &data)); 349 350 // Build the assembly kernel 351 if (!data->assemble_qfunction && !data->use_assembly_fallback) { 352 bool is_build_good = false; 353 354 CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good)); 355 if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(op, &is_build_good)); 356 if (!is_build_good) data->use_assembly_fallback = true; 357 } 358 359 // Try assembly 360 if (!data->use_assembly_fallback) { 361 bool is_run_good = true; 362 Ceed_Cuda *cuda_data; 363 CeedInt num_elem, num_input_fields, num_output_fields; 364 CeedEvalMode eval_mode; 365 CeedScalar *assembled_array; 366 CeedQFunctionField *qf_input_fields, *qf_output_fields; 367 CeedQFunction_Cuda_gen *qf_data; 368 CeedQFunction qf; 369 CeedOperatorField *op_input_fields, *op_output_fields; 370 371 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 372 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 373 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 374 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 375 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 376 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 377 378 // Input vectors 379 for (CeedInt i = 0; i < num_input_fields; i++) { 380 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 381 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 382 data->fields.inputs[i] = NULL; 383 } else { 384 bool is_active; 385 CeedVector vec; 386 387 // Get input vector 388 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 389 is_active = vec == CEED_VECTOR_ACTIVE; 390 if (is_active) data->fields.inputs[i] = NULL; 391 else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); 392 CeedCallBackend(CeedVectorDestroy(&vec)); 393 } 394 } 395 396 // Get context data 397 CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); 398 399 // Build objects if needed 400 if (build_objects) { 401 CeedInt qf_size_in = 0, qf_size_out = 0, Q; 402 403 // Count number of active input fields 404 { 405 for (CeedInt i = 0; i < num_input_fields; i++) { 406 CeedInt field_size; 407 CeedVector vec; 408 409 // Get input vector 410 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 411 // Check if active input 412 if (vec == CEED_VECTOR_ACTIVE) { 413 CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); 414 qf_size_in += field_size; 415 } 416 CeedCallBackend(CeedVectorDestroy(&vec)); 417 } 418 CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); 419 } 420 421 // Count number of active output fields 422 { 423 for (CeedInt i = 0; i < num_output_fields; i++) { 424 CeedInt field_size; 425 CeedVector vec; 426 427 // Get output vector 428 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 429 // Check if active output 430 if (vec == CEED_VECTOR_ACTIVE) { 431 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 432 qf_size_out += field_size; 433 } 434 CeedCallBackend(CeedVectorDestroy(&vec)); 435 } 436 CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); 437 } 438 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 439 440 // Actually build objects now 441 const CeedSize l_size = (CeedSize)num_elem * Q * qf_size_in * qf_size_out; 442 CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ 443 444 // Create output restriction 445 CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out, 446 (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, 447 rstr)); 448 // Create assembled vector 449 CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled)); 450 } 451 452 // Assembly array 453 CeedCallBackend(CeedVectorGetArrayWrite(*assembled, CEED_MEM_DEVICE, &assembled_array)); 454 455 // Assemble QFunction 456 void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array}; 457 bool is_tensor = false; 458 int max_threads_per_block, min_grid_size, grid; 459 460 CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor)); 461 CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 462 int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1}; 463 464 if (is_tensor) { 465 CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block, 466 cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); 467 } else { 468 CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1)); 469 470 grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 471 block[2] = elems_per_block; 472 } 473 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 474 475 CeedCallBackend( 476 CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_qfunction, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs)); 477 CeedCallCuda(ceed, cudaDeviceSynchronize()); 478 479 // Restore input arrays 480 for (CeedInt i = 0; i < num_input_fields; i++) { 481 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 482 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 483 } else { 484 bool is_active; 485 CeedVector vec; 486 487 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 488 is_active = vec == CEED_VECTOR_ACTIVE; 489 if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); 490 CeedCallBackend(CeedVectorDestroy(&vec)); 491 } 492 } 493 494 // Restore context data 495 CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); 496 497 // Restore assembly array 498 CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); 499 500 // Cleanup 501 CeedCallBackend(CeedQFunctionDestroy(&qf)); 502 if (!is_run_good) { 503 data->use_assembly_fallback = true; 504 if (build_objects) { 505 CeedCallBackend(CeedVectorDestroy(assembled)); 506 CeedCallBackend(CeedElemRestrictionDestroy(rstr)); 507 } 508 } 509 } 510 CeedCallBackend(CeedDestroy(&ceed)); 511 512 // Fallback, if needed 513 if (data->use_assembly_fallback) { 514 CeedOperator op_fallback; 515 516 CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for LinearAssemblyQFunction\n"); 517 CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); 518 CeedCallBackend(CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request)); 519 return CEED_ERROR_SUCCESS; 520 } 521 return CEED_ERROR_SUCCESS; 522 } 523 524 static int CeedOperatorLinearAssembleQFunction_Cuda_gen(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { 525 return CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(op, true, assembled, rstr, request); 526 } 527 528 static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { 529 return CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(op, false, &assembled, &rstr, request); 530 } 531 532 //------------------------------------------------------------------------------ 533 // AtPoints diagonal assembly 534 //------------------------------------------------------------------------------ 535 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) { 536 Ceed ceed; 537 CeedOperator_Cuda_gen *data; 538 539 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 540 CeedCallBackend(CeedOperatorGetData(op, &data)); 541 542 // Build the assembly kernel 543 if (!data->assemble_diagonal && !data->use_assembly_fallback) { 544 bool is_build_good = false; 545 CeedInt num_active_bases_in, num_active_bases_out; 546 CeedOperatorAssemblyData assembly_data; 547 548 CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data)); 549 CeedCallBackend( 550 CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL)); 551 if (num_active_bases_in == num_active_bases_out) { 552 CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good)); 553 if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(op, &is_build_good)); 554 } 555 if (!is_build_good) data->use_assembly_fallback = true; 556 } 557 558 // Try assembly 559 if (!data->use_assembly_fallback) { 560 bool is_run_good = true; 561 Ceed_Cuda *cuda_data; 562 CeedInt num_elem, num_input_fields, num_output_fields; 563 CeedEvalMode eval_mode; 564 CeedScalar *assembled_array; 565 CeedQFunctionField *qf_input_fields, *qf_output_fields; 566 CeedQFunction_Cuda_gen *qf_data; 567 CeedQFunction qf; 568 CeedOperatorField *op_input_fields, *op_output_fields; 569 570 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 571 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 572 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 573 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 574 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 575 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 576 577 // Input vectors 578 for (CeedInt i = 0; i < num_input_fields; i++) { 579 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 580 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 581 data->fields.inputs[i] = NULL; 582 } else { 583 bool is_active; 584 CeedVector vec; 585 586 // Get input vector 587 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 588 is_active = vec == CEED_VECTOR_ACTIVE; 589 if (is_active) data->fields.inputs[i] = NULL; 590 else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); 591 CeedCallBackend(CeedVectorDestroy(&vec)); 592 } 593 } 594 595 // Point coordinates 596 { 597 CeedVector vec; 598 599 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 600 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); 601 CeedCallBackend(CeedVectorDestroy(&vec)); 602 603 // Points per elem 604 if (num_elem != data->points.num_elem) { 605 CeedInt *points_per_elem; 606 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 607 CeedElemRestriction rstr_points = NULL; 608 609 data->points.num_elem = num_elem; 610 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 611 CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); 612 for (CeedInt e = 0; e < num_elem; e++) { 613 CeedInt num_points_elem; 614 615 CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); 616 points_per_elem[e] = num_points_elem; 617 } 618 if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); 619 CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); 620 CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); 621 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 622 CeedCallBackend(CeedFree(&points_per_elem)); 623 } 624 } 625 626 // Get context data 627 CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); 628 629 // Assembly array 630 CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array)); 631 632 // Assemble diagonal 633 void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array}; 634 int max_threads_per_block, min_grid_size, grid; 635 636 CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 637 int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; 638 639 CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1, 640 cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); 641 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 642 643 CeedCallBackend( 644 CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_diagonal, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs)); 645 CeedCallCuda(ceed, cudaDeviceSynchronize()); 646 647 // Restore input arrays 648 for (CeedInt i = 0; i < num_input_fields; i++) { 649 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 650 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 651 } else { 652 bool is_active; 653 CeedVector vec; 654 655 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 656 is_active = vec == CEED_VECTOR_ACTIVE; 657 if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); 658 CeedCallBackend(CeedVectorDestroy(&vec)); 659 } 660 } 661 662 // Restore point coordinates 663 { 664 CeedVector vec; 665 666 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 667 CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); 668 CeedCallBackend(CeedVectorDestroy(&vec)); 669 } 670 671 // Restore context data 672 CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); 673 674 // Restore assembly array 675 CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array)); 676 677 // Cleanup 678 CeedCallBackend(CeedQFunctionDestroy(&qf)); 679 if (!is_run_good) data->use_assembly_fallback = true; 680 } 681 CeedCallBackend(CeedDestroy(&ceed)); 682 683 // Fallback, if needed 684 if (data->use_assembly_fallback) { 685 CeedOperator op_fallback; 686 687 CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for AtPoints LinearAssembleAddDiagonal\n"); 688 CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); 689 CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request)); 690 return CEED_ERROR_SUCCESS; 691 } 692 return CEED_ERROR_SUCCESS; 693 } 694 695 //------------------------------------------------------------------------------ 696 // AtPoints full assembly 697 //------------------------------------------------------------------------------ 698 static int CeedSingleOperatorAssembleAtPoints_Cuda_gen(CeedOperator op, CeedInt offset, CeedVector assembled) { 699 Ceed ceed; 700 CeedOperator_Cuda_gen *data; 701 702 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 703 CeedCallBackend(CeedOperatorGetData(op, &data)); 704 705 // Build the assembly kernel 706 if (!data->assemble_full && !data->use_assembly_fallback) { 707 bool is_build_good = false; 708 CeedInt num_active_bases_in, num_active_bases_out; 709 CeedOperatorAssemblyData assembly_data; 710 711 CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data)); 712 CeedCallBackend( 713 CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL)); 714 if (num_active_bases_in == num_active_bases_out) { 715 CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good)); 716 if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(op, &is_build_good)); 717 } 718 if (!is_build_good) data->use_assembly_fallback = true; 719 } 720 721 // Try assembly 722 if (!data->use_assembly_fallback) { 723 bool is_run_good = true; 724 Ceed_Cuda *cuda_data; 725 CeedInt num_elem, num_input_fields, num_output_fields; 726 CeedEvalMode eval_mode; 727 CeedScalar *assembled_array; 728 CeedQFunctionField *qf_input_fields, *qf_output_fields; 729 CeedQFunction_Cuda_gen *qf_data; 730 CeedQFunction qf; 731 CeedOperatorField *op_input_fields, *op_output_fields; 732 733 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 734 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 735 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 736 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 737 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 738 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 739 740 // Input vectors 741 for (CeedInt i = 0; i < num_input_fields; i++) { 742 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 743 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 744 data->fields.inputs[i] = NULL; 745 } else { 746 bool is_active; 747 CeedVector vec; 748 749 // Get input vector 750 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 751 is_active = vec == CEED_VECTOR_ACTIVE; 752 if (is_active) data->fields.inputs[i] = NULL; 753 else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); 754 CeedCallBackend(CeedVectorDestroy(&vec)); 755 } 756 } 757 758 // Point coordinates 759 { 760 CeedVector vec; 761 762 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 763 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); 764 CeedCallBackend(CeedVectorDestroy(&vec)); 765 766 // Points per elem 767 if (num_elem != data->points.num_elem) { 768 CeedInt *points_per_elem; 769 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 770 CeedElemRestriction rstr_points = NULL; 771 772 data->points.num_elem = num_elem; 773 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 774 CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); 775 for (CeedInt e = 0; e < num_elem; e++) { 776 CeedInt num_points_elem; 777 778 CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); 779 points_per_elem[e] = num_points_elem; 780 } 781 if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); 782 CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); 783 CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); 784 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 785 CeedCallBackend(CeedFree(&points_per_elem)); 786 } 787 } 788 789 // Get context data 790 CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); 791 792 // Assembly array 793 CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array)); 794 CeedScalar *assembled_offset_array = &assembled_array[offset]; 795 796 // Assemble diagonal 797 void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, 798 &data->G, &data->W, &data->points, &assembled_offset_array}; 799 int max_threads_per_block, min_grid_size, grid; 800 801 CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 802 int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; 803 804 CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1, 805 cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); 806 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 807 808 CeedCallBackend( 809 CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_full, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs)); 810 CeedCallCuda(ceed, cudaDeviceSynchronize()); 811 812 // Restore input arrays 813 for (CeedInt i = 0; i < num_input_fields; i++) { 814 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 815 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 816 } else { 817 bool is_active; 818 CeedVector vec; 819 820 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 821 is_active = vec == CEED_VECTOR_ACTIVE; 822 if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); 823 CeedCallBackend(CeedVectorDestroy(&vec)); 824 } 825 } 826 827 // Restore point coordinates 828 { 829 CeedVector vec; 830 831 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 832 CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); 833 CeedCallBackend(CeedVectorDestroy(&vec)); 834 } 835 836 // Restore context data 837 CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); 838 839 // Restore assembly array 840 CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array)); 841 842 // Cleanup 843 CeedCallBackend(CeedQFunctionDestroy(&qf)); 844 if (!is_run_good) data->use_assembly_fallback = true; 845 } 846 CeedCallBackend(CeedDestroy(&ceed)); 847 848 // Fallback, if needed 849 if (data->use_assembly_fallback) { 850 CeedOperator op_fallback; 851 852 CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for AtPoints SingleOperatorAssemble\n"); 853 CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); 854 CeedCallBackend(CeedSingleOperatorAssemble(op_fallback, offset, assembled)); 855 return CEED_ERROR_SUCCESS; 856 } 857 return CEED_ERROR_SUCCESS; 858 } 859 860 //------------------------------------------------------------------------------ 861 // Create operator 862 //------------------------------------------------------------------------------ 863 int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 864 bool is_composite, is_at_points; 865 Ceed ceed; 866 CeedOperator_Cuda_gen *impl; 867 868 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 869 CeedCallBackend(CeedCalloc(1, &impl)); 870 CeedCallBackend(CeedOperatorSetData(op, impl)); 871 CeedCall(CeedOperatorIsComposite(op, &is_composite)); 872 if (is_composite) { 873 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Cuda_gen)); 874 } else { 875 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen)); 876 } 877 CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); 878 if (is_at_points) { 879 CeedCallBackend( 880 CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen)); 881 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssembleAtPoints_Cuda_gen)); 882 } 883 if (!is_at_points) { 884 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda_gen)); 885 CeedCallBackend( 886 CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen)); 887 } 888 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen)); 889 CeedCallBackend(CeedDestroy(&ceed)); 890 return CEED_ERROR_SUCCESS; 891 } 892 893 //------------------------------------------------------------------------------ 894