1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-source/cuda/cuda-types.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stddef.h> 14 #include <string.h> 15 16 #include "../cuda/ceed-cuda-common.h" 17 #include "../cuda/ceed-cuda-compile.h" 18 #include "ceed-cuda-gen-operator-build.h" 19 #include "ceed-cuda-gen.h" 20 21 //------------------------------------------------------------------------------ 22 // Destroy operator 23 //------------------------------------------------------------------------------ 24 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 25 Ceed ceed; 26 CeedOperator_Cuda_gen *impl; 27 28 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 29 CeedCallBackend(CeedOperatorGetData(op, &impl)); 30 if (impl->module) CeedCallCuda(ceed, cuModuleUnload(impl->module)); 31 if (impl->module_assemble_full) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_full)); 32 if (impl->module_assemble_diagonal) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_diagonal)); 33 if (impl->module_assemble_qfunction) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_qfunction)); 34 if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem)); 35 CeedCallBackend(CeedFree(&impl)); 36 CeedCallBackend(CeedDestroy(&ceed)); 37 return CEED_ERROR_SUCCESS; 38 } 39 40 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, int elems_per_block) { 41 int useful_threads_per_block = threads_per_elem * elems_per_block; 42 // round up to nearest multiple of warp_size 43 int block_size = CeedDivUpInt(useful_threads_per_block, warp_size) * warp_size; 44 int blocks_per_sm = threads_per_sm / block_size; 45 return threads_per_sm - useful_threads_per_block * blocks_per_sm; 46 } 47 48 // Choose the least wasteful block size constrained by blocks_per_sm of max_threads_per_block. 49 // 50 // The x and y part of block[] contains per-element sizes (specified on input) while the z part is number of elements. 51 // 52 // Problem setting: we'd like to make occupancy high with relatively few inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how 53 // many threads can run. 54 // 55 // Note that full occupancy sometimes can't be achieved by one thread block. 56 // For example, an SM might support 1536 threads in total, but only 1024 within a single thread block. 57 // So cuOccupancyMaxPotentialBlockSize may suggest a block size of 768 so that two blocks can run, versus one block of 1024 will prevent a second 58 // block from running. The cuda-gen kernels are pretty heavy with lots of instruction-level parallelism (ILP) so we'll generally be okay with 59 // relatively low occupancy and smaller thread blocks, but we solve a reasonably general problem here. Empirically, we find that blocks bigger than 60 // about 256 have higher latency and worse load balancing when the number of elements is modest. 61 // 62 // cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of the number of quadrature points (or number of basis functions). 63 // They also have a lot of __syncthreads(), which is another point against excessively large thread blocks. 64 // Suppose I have elements with 7x7x7 quadrature points. 65 // This will loop over the last dimension, so we have 7*7=49 threads per element. 66 // Suppose we have two elements = 2*49=98 useful threads. 67 // CUDA schedules in units of full warps (32 threads), so 128 CUDA hardware threads are effectively committed to that block. 68 // Now suppose cuOccupancyMaxPotentialBlockSize returned 352. 69 // We can schedule 2 blocks of size 98 (196 useful threads using 256 hardware threads), but not a third block (which would need a total of 384 70 // hardware threads). 71 // 72 // If instead, we had packed 3 elements, we'd have 3*49=147 useful threads occupying 160 slots, and could schedule two blocks. 73 // Alternatively, we could pack a single block of 7 elements (2*49=343 useful threads) into the 354 slots. 74 // The latter has the least "waste", but __syncthreads() over-synchronizes and it might not pay off relative to smaller blocks. 75 static int BlockGridCalculate(CeedInt num_elem, int blocks_per_sm, int max_threads_per_block, int max_threads_z, int warp_size, int block[3], 76 int *grid) { 77 const int threads_per_sm = blocks_per_sm * max_threads_per_block; 78 const int threads_per_elem = block[0] * block[1]; 79 int elems_per_block = 1; 80 int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1); 81 82 for (int i = 2; i <= CeedIntMin(max_threads_per_block / threads_per_elem, num_elem); i++) { 83 int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i); 84 85 // We want to minimize waste, but smaller kernels have lower latency and less __syncthreads() overhead so when a larger block size has the same 86 // waste as a smaller one, go ahead and prefer the smaller block. 87 if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) { 88 elems_per_block = i; 89 waste = i_waste; 90 } 91 } 92 // In low-order elements, threads_per_elem may be sufficiently low to give an elems_per_block greater than allowable for the device, so we must 93 // check before setting the z-dimension size of the block. 94 block[2] = CeedIntMin(elems_per_block, max_threads_z); 95 *grid = CeedDivUpInt(num_elem, elems_per_block); 96 return CEED_ERROR_SUCCESS; 97 } 98 99 // callback for cuOccupancyMaxPotentialBlockSize, providing the amount of dynamic shared memory required for a thread block of size threads. 100 static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); } 101 102 //------------------------------------------------------------------------------ 103 // Apply and add to output 104 //------------------------------------------------------------------------------ 105 static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good, 106 CeedRequest *request) { 107 bool is_at_points, is_tensor; 108 Ceed ceed; 109 Ceed_Cuda *cuda_data; 110 CeedInt num_elem, num_input_fields, num_output_fields; 111 CeedEvalMode eval_mode; 112 CeedQFunctionField *qf_input_fields, *qf_output_fields; 113 CeedQFunction_Cuda_gen *qf_data; 114 CeedQFunction qf; 115 CeedOperatorField *op_input_fields, *op_output_fields; 116 CeedOperator_Cuda_gen *data; 117 118 // Build the operator kernel 119 CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, is_run_good)); 120 if (!(*is_run_good)) return CEED_ERROR_SUCCESS; 121 122 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 123 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 124 CeedCallBackend(CeedOperatorGetData(op, &data)); 125 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 126 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 127 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 128 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 129 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 130 131 // Input vectors 132 for (CeedInt i = 0; i < num_input_fields; i++) { 133 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 134 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 135 data->fields.inputs[i] = NULL; 136 } else { 137 bool is_active; 138 CeedVector vec; 139 140 // Get input vector 141 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 142 is_active = vec == CEED_VECTOR_ACTIVE; 143 if (is_active) data->fields.inputs[i] = input_arr; 144 else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); 145 CeedCallBackend(CeedVectorDestroy(&vec)); 146 } 147 } 148 149 // Output vectors 150 for (CeedInt i = 0; i < num_output_fields; i++) { 151 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 152 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 153 data->fields.outputs[i] = NULL; 154 } else { 155 bool is_active; 156 CeedVector vec; 157 158 // Get output vector 159 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 160 is_active = vec == CEED_VECTOR_ACTIVE; 161 if (is_active) data->fields.outputs[i] = output_arr; 162 else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i])); 163 CeedCallBackend(CeedVectorDestroy(&vec)); 164 } 165 } 166 167 // Point coordinates, if needed 168 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 169 if (is_at_points) { 170 // Coords 171 CeedVector vec; 172 173 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 174 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); 175 CeedCallBackend(CeedVectorDestroy(&vec)); 176 177 // Points per elem 178 if (num_elem != data->points.num_elem) { 179 CeedInt *points_per_elem; 180 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 181 CeedElemRestriction rstr_points = NULL; 182 183 data->points.num_elem = num_elem; 184 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 185 CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); 186 for (CeedInt e = 0; e < num_elem; e++) { 187 CeedInt num_points_elem; 188 189 CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); 190 points_per_elem[e] = num_points_elem; 191 } 192 if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); 193 CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); 194 CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); 195 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 196 CeedCallBackend(CeedFree(&points_per_elem)); 197 } 198 } 199 200 // Get context data 201 CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); 202 203 // Apply operator 204 void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points}; 205 int max_threads_per_block, min_grid_size, grid; 206 207 CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor)); 208 CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 209 int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1}; 210 211 if (is_tensor) { 212 CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block, 213 cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); 214 } else { 215 CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1)); 216 217 grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 218 block[2] = elems_per_block; 219 } 220 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 221 222 CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, stream, grid, block[0], block[1], block[2], shared_mem, is_run_good, opargs)); 223 224 // Restore input arrays 225 for (CeedInt i = 0; i < num_input_fields; i++) { 226 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 227 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 228 } else { 229 bool is_active; 230 CeedVector vec; 231 232 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 233 is_active = vec == CEED_VECTOR_ACTIVE; 234 if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); 235 CeedCallBackend(CeedVectorDestroy(&vec)); 236 } 237 } 238 239 // Restore output arrays 240 for (CeedInt i = 0; i < num_output_fields; i++) { 241 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 242 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 243 } else { 244 bool is_active; 245 CeedVector vec; 246 247 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 248 is_active = vec == CEED_VECTOR_ACTIVE; 249 if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i])); 250 CeedCallBackend(CeedVectorDestroy(&vec)); 251 } 252 } 253 254 // Restore point coordinates, if needed 255 if (is_at_points) { 256 CeedVector vec; 257 258 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 259 CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); 260 CeedCallBackend(CeedVectorDestroy(&vec)); 261 } 262 263 // Restore context data 264 CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); 265 266 // Cleanup 267 CeedCallBackend(CeedDestroy(&ceed)); 268 CeedCallBackend(CeedQFunctionDestroy(&qf)); 269 if (!(*is_run_good)) data->use_fallback = true; 270 return CEED_ERROR_SUCCESS; 271 } 272 273 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { 274 bool is_run_good = false; 275 const CeedScalar *input_arr = NULL; 276 CeedScalar *output_arr = NULL; 277 278 // Try to run kernel 279 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr)); 280 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr)); 281 CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, NULL, input_arr, output_arr, &is_run_good, request)); 282 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr)); 283 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr)); 284 285 // Fallback on unsuccessful run 286 if (!is_run_good) { 287 CeedOperator op_fallback; 288 289 CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for ApplyAdd\n"); 290 CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); 291 CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); 292 } 293 return CEED_ERROR_SUCCESS; 294 } 295 296 static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { 297 bool is_run_good[CEED_COMPOSITE_MAX] = {false}, is_sequential; 298 CeedInt num_suboperators; 299 const CeedScalar *input_arr = NULL; 300 CeedScalar *output_arr = NULL; 301 Ceed ceed; 302 CeedOperator *sub_operators; 303 cudaStream_t stream = NULL; 304 305 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 306 CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators)); 307 CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators)); 308 CeedCall(CeedOperatorCompositeIsSequential(op, &is_sequential)); 309 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr)); 310 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr)); 311 if (is_sequential) CeedCallCuda(ceed, cudaStreamCreate(&stream)); 312 for (CeedInt i = 0; i < num_suboperators; i++) { 313 CeedInt num_elem = 0; 314 315 CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem)); 316 if (num_elem > 0) { 317 if (!is_sequential) CeedCallCuda(ceed, cudaStreamCreate(&stream)); 318 CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request)); 319 if (!is_sequential) CeedCallCuda(ceed, cudaStreamDestroy(stream)); 320 } 321 } 322 if (is_sequential) CeedCallCuda(ceed, cudaStreamDestroy(stream)); 323 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr)); 324 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr)); 325 CeedCallCuda(ceed, cudaDeviceSynchronize()); 326 327 // Fallback on unsuccessful run 328 for (CeedInt i = 0; i < num_suboperators; i++) { 329 if (!is_run_good[i]) { 330 CeedOperator op_fallback; 331 332 CeedDebug(ceed, "\nFalling back to /gpu/cuda/ref CeedOperator for ApplyAdd\n"); 333 CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback)); 334 CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); 335 } 336 } 337 CeedCallBackend(CeedDestroy(&ceed)); 338 return CEED_ERROR_SUCCESS; 339 } 340 341 //------------------------------------------------------------------------------ 342 // QFunction assembly 343 //------------------------------------------------------------------------------ 344 static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, 345 CeedRequest *request) { 346 Ceed ceed; 347 CeedOperator_Cuda_gen *data; 348 349 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 350 CeedCallBackend(CeedOperatorGetData(op, &data)); 351 352 // Build the assembly kernel 353 if (!data->assemble_qfunction && !data->use_assembly_fallback) { 354 bool is_build_good = false; 355 356 CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good)); 357 if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(op, &is_build_good)); 358 if (!is_build_good) data->use_assembly_fallback = true; 359 } 360 361 // Try assembly 362 if (!data->use_assembly_fallback) { 363 bool is_run_good = true; 364 Ceed_Cuda *cuda_data; 365 CeedInt num_elem, num_input_fields, num_output_fields; 366 CeedEvalMode eval_mode; 367 CeedScalar *assembled_array; 368 CeedQFunctionField *qf_input_fields, *qf_output_fields; 369 CeedQFunction_Cuda_gen *qf_data; 370 CeedQFunction qf; 371 CeedOperatorField *op_input_fields, *op_output_fields; 372 373 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 374 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 375 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 376 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 377 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 378 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 379 380 // Input vectors 381 for (CeedInt i = 0; i < num_input_fields; i++) { 382 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 383 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 384 data->fields.inputs[i] = NULL; 385 } else { 386 bool is_active; 387 CeedVector vec; 388 389 // Get input vector 390 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 391 is_active = vec == CEED_VECTOR_ACTIVE; 392 if (is_active) data->fields.inputs[i] = NULL; 393 else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); 394 CeedCallBackend(CeedVectorDestroy(&vec)); 395 } 396 } 397 398 // Get context data 399 CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); 400 401 // Build objects if needed 402 if (build_objects) { 403 CeedInt qf_size_in = 0, qf_size_out = 0, Q; 404 405 // Count number of active input fields 406 { 407 for (CeedInt i = 0; i < num_input_fields; i++) { 408 CeedInt field_size; 409 CeedVector vec; 410 411 // Get input vector 412 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 413 // Check if active input 414 if (vec == CEED_VECTOR_ACTIVE) { 415 CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size)); 416 qf_size_in += field_size; 417 } 418 CeedCallBackend(CeedVectorDestroy(&vec)); 419 } 420 CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); 421 } 422 423 // Count number of active output fields 424 { 425 for (CeedInt i = 0; i < num_output_fields; i++) { 426 CeedInt field_size; 427 CeedVector vec; 428 429 // Get output vector 430 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 431 // Check if active output 432 if (vec == CEED_VECTOR_ACTIVE) { 433 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 434 qf_size_out += field_size; 435 } 436 CeedCallBackend(CeedVectorDestroy(&vec)); 437 } 438 CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); 439 } 440 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 441 442 // Actually build objects now 443 const CeedSize l_size = (CeedSize)num_elem * Q * qf_size_in * qf_size_out; 444 CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ 445 446 // Create output restriction 447 CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out, 448 (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, 449 rstr)); 450 // Create assembled vector 451 CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled)); 452 } 453 454 // Assembly array 455 CeedCallBackend(CeedVectorGetArrayWrite(*assembled, CEED_MEM_DEVICE, &assembled_array)); 456 457 // Assemble QFunction 458 void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array}; 459 bool is_tensor = false; 460 int max_threads_per_block, min_grid_size, grid; 461 462 CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor)); 463 CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 464 int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1}; 465 466 if (is_tensor) { 467 CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block, 468 cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); 469 } else { 470 CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1)); 471 472 grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 473 block[2] = elems_per_block; 474 } 475 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 476 477 CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_qfunction, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, 478 opargs)); 479 CeedCallCuda(ceed, cudaDeviceSynchronize()); 480 481 // Restore input arrays 482 for (CeedInt i = 0; i < num_input_fields; i++) { 483 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 484 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 485 } else { 486 bool is_active; 487 CeedVector vec; 488 489 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 490 is_active = vec == CEED_VECTOR_ACTIVE; 491 if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); 492 CeedCallBackend(CeedVectorDestroy(&vec)); 493 } 494 } 495 496 // Restore context data 497 CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); 498 499 // Restore assembly array 500 CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); 501 502 // Cleanup 503 CeedCallBackend(CeedQFunctionDestroy(&qf)); 504 if (!is_run_good) { 505 data->use_assembly_fallback = true; 506 if (build_objects) { 507 CeedCallBackend(CeedVectorDestroy(assembled)); 508 CeedCallBackend(CeedElemRestrictionDestroy(rstr)); 509 } 510 } 511 } 512 CeedCallBackend(CeedDestroy(&ceed)); 513 514 // Fallback, if needed 515 if (data->use_assembly_fallback) { 516 CeedOperator op_fallback; 517 518 CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for LinearAssemblyQFunction\n"); 519 CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); 520 CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request)); 521 return CEED_ERROR_SUCCESS; 522 } 523 return CEED_ERROR_SUCCESS; 524 } 525 526 static int CeedOperatorLinearAssembleQFunction_Cuda_gen(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { 527 return CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(op, true, assembled, rstr, request); 528 } 529 530 static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { 531 return CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(op, false, &assembled, &rstr, request); 532 } 533 534 //------------------------------------------------------------------------------ 535 // AtPoints diagonal assembly 536 //------------------------------------------------------------------------------ 537 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) { 538 Ceed ceed; 539 CeedOperator_Cuda_gen *data; 540 541 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 542 CeedCallBackend(CeedOperatorGetData(op, &data)); 543 544 // Build the assembly kernel 545 if (!data->assemble_diagonal && !data->use_assembly_fallback) { 546 bool is_build_good = false; 547 CeedInt num_active_bases_in, num_active_bases_out; 548 CeedOperatorAssemblyData assembly_data; 549 550 CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data)); 551 CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, 552 NULL, NULL)); 553 if (num_active_bases_in == num_active_bases_out) { 554 CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good)); 555 if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(op, &is_build_good)); 556 } 557 if (!is_build_good) data->use_assembly_fallback = true; 558 } 559 560 // Try assembly 561 if (!data->use_assembly_fallback) { 562 bool is_run_good = true; 563 Ceed_Cuda *cuda_data; 564 CeedInt num_elem, num_input_fields, num_output_fields; 565 CeedEvalMode eval_mode; 566 CeedScalar *assembled_array; 567 CeedQFunctionField *qf_input_fields, *qf_output_fields; 568 CeedQFunction_Cuda_gen *qf_data; 569 CeedQFunction qf; 570 CeedOperatorField *op_input_fields, *op_output_fields; 571 572 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 573 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 574 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 575 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 576 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 577 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 578 579 // Input vectors 580 for (CeedInt i = 0; i < num_input_fields; i++) { 581 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 582 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 583 data->fields.inputs[i] = NULL; 584 } else { 585 bool is_active; 586 CeedVector vec; 587 588 // Get input vector 589 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 590 is_active = vec == CEED_VECTOR_ACTIVE; 591 if (is_active) data->fields.inputs[i] = NULL; 592 else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); 593 CeedCallBackend(CeedVectorDestroy(&vec)); 594 } 595 } 596 597 // Point coordinates 598 { 599 CeedVector vec; 600 601 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 602 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); 603 CeedCallBackend(CeedVectorDestroy(&vec)); 604 605 // Points per elem 606 if (num_elem != data->points.num_elem) { 607 CeedInt *points_per_elem; 608 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 609 CeedElemRestriction rstr_points = NULL; 610 611 data->points.num_elem = num_elem; 612 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 613 CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); 614 for (CeedInt e = 0; e < num_elem; e++) { 615 CeedInt num_points_elem; 616 617 CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); 618 points_per_elem[e] = num_points_elem; 619 } 620 if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); 621 CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); 622 CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); 623 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 624 CeedCallBackend(CeedFree(&points_per_elem)); 625 } 626 } 627 628 // Get context data 629 CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); 630 631 // Assembly array 632 CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array)); 633 634 // Assemble diagonal 635 void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array}; 636 int max_threads_per_block, min_grid_size, grid; 637 638 CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 639 int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; 640 641 CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1, 642 cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); 643 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 644 645 CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_diagonal, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, 646 opargs)); 647 CeedCallCuda(ceed, cudaDeviceSynchronize()); 648 649 // Restore input arrays 650 for (CeedInt i = 0; i < num_input_fields; i++) { 651 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 652 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 653 } else { 654 bool is_active; 655 CeedVector vec; 656 657 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 658 is_active = vec == CEED_VECTOR_ACTIVE; 659 if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); 660 CeedCallBackend(CeedVectorDestroy(&vec)); 661 } 662 } 663 664 // Restore point coordinates 665 { 666 CeedVector vec; 667 668 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 669 CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); 670 CeedCallBackend(CeedVectorDestroy(&vec)); 671 } 672 673 // Restore context data 674 CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); 675 676 // Restore assembly array 677 CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array)); 678 679 // Cleanup 680 CeedCallBackend(CeedQFunctionDestroy(&qf)); 681 if (!is_run_good) data->use_assembly_fallback = true; 682 } 683 CeedCallBackend(CeedDestroy(&ceed)); 684 685 // Fallback, if needed 686 if (data->use_assembly_fallback) { 687 CeedOperator op_fallback; 688 689 CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for AtPoints LinearAssembleAddDiagonal\n"); 690 CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); 691 CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request)); 692 return CEED_ERROR_SUCCESS; 693 } 694 return CEED_ERROR_SUCCESS; 695 } 696 697 //------------------------------------------------------------------------------ 698 // AtPoints full assembly 699 //------------------------------------------------------------------------------ 700 static int CeedOperatorAssembleSingleAtPoints_Cuda_gen(CeedOperator op, CeedInt offset, CeedVector assembled) { 701 Ceed ceed; 702 CeedOperator_Cuda_gen *data; 703 704 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 705 CeedCallBackend(CeedOperatorGetData(op, &data)); 706 707 // Build the assembly kernel 708 if (!data->assemble_full && !data->use_assembly_fallback) { 709 bool is_build_good = false; 710 CeedInt num_active_bases_in, num_active_bases_out; 711 CeedOperatorAssemblyData assembly_data; 712 713 CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data)); 714 CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, 715 NULL, NULL)); 716 if (num_active_bases_in == num_active_bases_out) { 717 CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good)); 718 if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(op, &is_build_good)); 719 } 720 if (!is_build_good) data->use_assembly_fallback = true; 721 } 722 723 // Try assembly 724 if (!data->use_assembly_fallback) { 725 bool is_run_good = true; 726 Ceed_Cuda *cuda_data; 727 CeedInt num_elem, num_input_fields, num_output_fields; 728 CeedEvalMode eval_mode; 729 CeedScalar *assembled_array; 730 CeedQFunctionField *qf_input_fields, *qf_output_fields; 731 CeedQFunction_Cuda_gen *qf_data; 732 CeedQFunction qf; 733 CeedOperatorField *op_input_fields, *op_output_fields; 734 735 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 736 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 737 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 738 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 739 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 740 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 741 742 // Input vectors 743 for (CeedInt i = 0; i < num_input_fields; i++) { 744 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 745 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 746 data->fields.inputs[i] = NULL; 747 } else { 748 bool is_active; 749 CeedVector vec; 750 751 // Get input vector 752 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 753 is_active = vec == CEED_VECTOR_ACTIVE; 754 if (is_active) data->fields.inputs[i] = NULL; 755 else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); 756 CeedCallBackend(CeedVectorDestroy(&vec)); 757 } 758 } 759 760 // Point coordinates 761 { 762 CeedVector vec; 763 764 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 765 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); 766 CeedCallBackend(CeedVectorDestroy(&vec)); 767 768 // Points per elem 769 if (num_elem != data->points.num_elem) { 770 CeedInt *points_per_elem; 771 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 772 CeedElemRestriction rstr_points = NULL; 773 774 data->points.num_elem = num_elem; 775 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 776 CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); 777 for (CeedInt e = 0; e < num_elem; e++) { 778 CeedInt num_points_elem; 779 780 CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); 781 points_per_elem[e] = num_points_elem; 782 } 783 if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); 784 CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); 785 CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); 786 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 787 CeedCallBackend(CeedFree(&points_per_elem)); 788 } 789 } 790 791 // Get context data 792 CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); 793 794 // Assembly array 795 CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array)); 796 CeedScalar *assembled_offset_array = &assembled_array[offset]; 797 798 // Assemble diagonal 799 void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, 800 &data->G, &data->W, &data->points, &assembled_offset_array}; 801 int max_threads_per_block, min_grid_size, grid; 802 803 CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 804 int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; 805 806 CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1, 807 cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); 808 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 809 810 CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_full, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, 811 opargs)); 812 CeedCallCuda(ceed, cudaDeviceSynchronize()); 813 814 // Restore input arrays 815 for (CeedInt i = 0; i < num_input_fields; i++) { 816 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 817 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 818 } else { 819 bool is_active; 820 CeedVector vec; 821 822 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 823 is_active = vec == CEED_VECTOR_ACTIVE; 824 if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); 825 CeedCallBackend(CeedVectorDestroy(&vec)); 826 } 827 } 828 829 // Restore point coordinates 830 { 831 CeedVector vec; 832 833 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 834 CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); 835 CeedCallBackend(CeedVectorDestroy(&vec)); 836 } 837 838 // Restore context data 839 CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); 840 841 // Restore assembly array 842 CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array)); 843 844 // Cleanup 845 CeedCallBackend(CeedQFunctionDestroy(&qf)); 846 if (!is_run_good) data->use_assembly_fallback = true; 847 } 848 CeedCallBackend(CeedDestroy(&ceed)); 849 850 // Fallback, if needed 851 if (data->use_assembly_fallback) { 852 CeedOperator op_fallback; 853 854 CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for AtPoints SingleOperatorAssemble\n"); 855 CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); 856 CeedCallBackend(CeedOperatorAssembleSingle(op_fallback, offset, assembled)); 857 return CEED_ERROR_SUCCESS; 858 } 859 return CEED_ERROR_SUCCESS; 860 } 861 862 //------------------------------------------------------------------------------ 863 // Create operator 864 //------------------------------------------------------------------------------ 865 int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 866 bool is_composite, is_at_points; 867 Ceed ceed; 868 CeedOperator_Cuda_gen *impl; 869 870 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 871 CeedCallBackend(CeedCalloc(1, &impl)); 872 CeedCallBackend(CeedOperatorSetData(op, impl)); 873 CeedCall(CeedOperatorIsComposite(op, &is_composite)); 874 if (is_composite) { 875 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Cuda_gen)); 876 } else { 877 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen)); 878 } 879 CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); 880 if (is_at_points) { 881 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", 882 CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen)); 883 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Cuda_gen)); 884 } 885 if (!is_at_points) { 886 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda_gen)); 887 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", 888 CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen)); 889 } 890 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen)); 891 CeedCallBackend(CeedDestroy(&ceed)); 892 return CEED_ERROR_SUCCESS; 893 } 894 895 //------------------------------------------------------------------------------ 896