xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator.c (revision 9ba83ac0e4b1fca39d6fa6737a318a9f0cbc172d)
1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-source/hip/hip-types.h>
11 #include <stddef.h>
12 #include <hip/hiprtc.h>
13 
14 #include "../hip/ceed-hip-common.h"
15 #include "../hip/ceed-hip-compile.h"
16 #include "ceed-hip-gen-operator-build.h"
17 #include "ceed-hip-gen.h"
18 
19 //------------------------------------------------------------------------------
20 // Destroy operator
21 //------------------------------------------------------------------------------
22 static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
23   Ceed                  ceed;
24   CeedOperator_Hip_gen *impl;
25   bool                  is_composite;
26 
27   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
28   CeedCallBackend(CeedOperatorGetData(op, &impl));
29   CeedCallBackend(CeedOperatorIsComposite(op, &is_composite));
30   if (is_composite) {
31     CeedInt num_suboperators;
32 
33     CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
34     for (CeedInt i = 0; i < num_suboperators; i++) {
35       if (impl->streams[i]) CeedCallHip(ceed, hipStreamDestroy(impl->streams[i]));
36       impl->streams[i] = NULL;
37     }
38   }
39   if (impl->module) CeedCallHip(ceed, hipModuleUnload(impl->module));
40   if (impl->module_assemble_full) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_full));
41   if (impl->module_assemble_diagonal) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_diagonal));
42   if (impl->module_assemble_qfunction) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_qfunction));
43   if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem));
44   CeedCallBackend(CeedFree(&impl));
45   CeedCallBackend(CeedDestroy(&ceed));
46   return CEED_ERROR_SUCCESS;
47 }
48 
49 //------------------------------------------------------------------------------
50 // Apply and add to output
51 //------------------------------------------------------------------------------
static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream, const CeedScalar *input_arr, CeedScalar *output_arr,
                                            bool *is_run_good, CeedRequest *request) {
  // Core apply: (re)build the fused operator kernel if needed, point the kernel
  // argument struct at device arrays for every field (plus point data for
  // AtPoints operators), then launch the single fused kernel on `stream`
  // (NULL selects the default HIP stream).  On exit *is_run_good reports whether
  // build + launch succeeded; on failure the operator is flagged for the
  // reference-backend fallback.  `request` is unused in this function.
  bool                   is_at_points, is_tensor;
  Ceed                   ceed;
  CeedInt                num_elem, num_input_fields, num_output_fields;
  CeedEvalMode           eval_mode;
  CeedQFunctionField    *qf_input_fields, *qf_output_fields;
  CeedQFunction_Hip_gen *qf_data;
  CeedQFunction          qf;
  CeedOperatorField     *op_input_fields, *op_output_fields;
  CeedOperator_Hip_gen  *data;

  // Creation of the operator — JiT build of the fused kernel.
  // A failed build is not an error: report success with *is_run_good == false
  // so the caller can fall back to the reference implementation.
  CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, is_run_good));
  if (!(*is_run_good)) return CEED_ERROR_SUCCESS;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &data));
  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

  // Input vectors: active fields use the caller-supplied input_arr;
  // passive fields borrow a read-only device array from their vector.
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
      data->fields.inputs[i] = NULL;
    } else {
      bool       is_active;
      CeedVector vec;

      // Get input vector
      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (is_active) data->fields.inputs[i] = input_arr;
      else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Output vectors: same pattern as inputs, but writable arrays.
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
      data->fields.outputs[i] = NULL;
    } else {
      bool       is_active;
      CeedVector vec;

      // Get output vector
      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (is_active) data->fields.outputs[i] = output_arr;
      else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Point coordinates, if needed
  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
  if (is_at_points) {
    // Coords
    CeedVector vec;

    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
    CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
    CeedCallBackend(CeedVectorDestroy(&vec));

    // Points per elem — rebuild the cached device array only when the element
    // count changed since the last apply.
    if (num_elem != data->points.num_elem) {
      CeedInt            *points_per_elem;
      // NOTE(review): num_bytes is a CeedInt; assumes num_elem * sizeof(CeedInt)
      // fits without overflow — confirm for very large element counts.
      const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
      CeedElemRestriction rstr_points = NULL;

      data->points.num_elem = num_elem;
      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
      CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
      for (CeedInt e = 0; e < num_elem; e++) {
        CeedInt num_points_elem;

        CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
        points_per_elem[e] = num_points_elem;
      }
      // Release any previous device copy before allocating the new one
      if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
      CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
      CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
      CeedCallBackend(CeedFree(&points_per_elem));
    }
  }

  // Get context data
  CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

  // Apply operator — argument list must match the JiT-generated kernel signature
  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};

  CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
  CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};

  if (is_tensor) {
    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
  } else {
    // Non-tensor: cap the thread block at 256 threads, at least 1 element per block
    CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;

    elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
    block_sizes[2]  = elems_per_block;
  }
  // grid = ceil(num_elem / elems_per_block); shared memory scales with thread_1d
  // (linear for 1D/non-tensor, squared for 2D/3D — 2D and 3D launches are identical here).
  if (data->dim == 1 || !is_tensor) {
    CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
    CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);

    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
                                                  is_run_good, opargs));
  } else if (data->dim == 2) {
    CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
                                                  is_run_good, opargs));
  } else if (data->dim == 3) {
    CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
                                                  is_run_good, opargs));
  }

  // Restore input arrays (only passive fields borrowed an array above)
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
    } else {
      bool       is_active;
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Restore output arrays (only passive fields borrowed an array above)
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
    } else {
      bool       is_active;
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Restore point coordinates, if needed
  if (is_at_points) {
    CeedVector vec;

    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
    CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
    CeedCallBackend(CeedVectorDestroy(&vec));
  }

  // Restore context data
  CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));

  // Cleanup
  CeedCallBackend(CeedDestroy(&ceed));
  CeedCallBackend(CeedQFunctionDestroy(&qf));
  // A failed launch flags this operator to use the reference fallback from now on
  if (!(*is_run_good)) data->use_fallback = true;
  return CEED_ERROR_SUCCESS;
}
230 
231 static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
232   bool              is_run_good = false;
233   const CeedScalar *input_arr   = NULL;
234   CeedScalar       *output_arr  = NULL;
235 
236   // Try to run kernel
237   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
238   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
239   CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(op, NULL, input_arr, output_arr, &is_run_good, request));
240   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
241   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
242 
243   // Fallback on unsuccessful run
244   if (!is_run_good) {
245     CeedOperator op_fallback;
246 
247     CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n");
248     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
249     CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
250   }
251   return CEED_ERROR_SUCCESS;
252 }
253 
static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
  // Apply-and-add for a composite operator: each suboperator runs on its own
  // lazily-created HIP stream so the fused kernels can overlap; the streams are
  // synchronized afterwards, and any suboperator whose run failed is re-applied
  // through the reference-backend fallback.
  bool                  is_run_good[CEED_COMPOSITE_MAX] = {false};
  CeedInt               num_suboperators;
  const CeedScalar     *input_arr  = NULL;
  CeedScalar           *output_arr = NULL;
  Ceed                  ceed;
  CeedOperator_Hip_gen *impl;
  CeedOperator         *sub_operators;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &impl));
  CeedCallBackend(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
  CeedCallBackend(CeedOperatorCompositeGetSubList(op, &sub_operators));
  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
  for (CeedInt i = 0; i < num_suboperators; i++) {
    CeedInt num_elem = 0;

    CeedCallBackend(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
    if (num_elem > 0) {
      // Create the stream on first use; it is destroyed in CeedOperatorDestroy_Hip_gen
      if (!impl->streams[i]) CeedCallHip(ceed, hipStreamCreate(&impl->streams[i]));
      CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], impl->streams[i], input_arr, output_arr, &is_run_good[i], request));
    } else {
      // Nothing to do for an empty suboperator; count it as successful
      is_run_good[i] = true;
    }
  }

  // Wait for every successfully-launched suboperator kernel to finish
  for (CeedInt i = 0; i < num_suboperators; i++) {
    if (impl->streams[i]) {
      if (is_run_good[i]) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[i]));
    }
  }
  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
  CeedCallHip(ceed, hipDeviceSynchronize());

  // Fallback on unsuccessful run — re-apply only the suboperators that failed
  for (CeedInt i = 0; i < num_suboperators; i++) {
    if (!is_run_good[i]) {
      CeedOperator op_fallback;

      CeedDebug(ceed, "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n");
      CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
      CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
    }
  }
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
303 
304 //------------------------------------------------------------------------------
305 // QFunction assembly
306 //------------------------------------------------------------------------------
307 static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
308                                                            CeedRequest *request) {
309   Ceed                  ceed;
310   CeedOperator_Hip_gen *data;
311 
312   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
313   CeedCallBackend(CeedOperatorGetData(op, &data));
314 
315   // Build the assembly kernel
316   if (!data->assemble_qfunction && !data->use_assembly_fallback) {
317     bool is_build_good = false;
318 
319     CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
320     if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(op, &is_build_good));
321     if (!is_build_good) data->use_assembly_fallback = true;
322   }
323 
324   // Try assembly
325   if (!data->use_assembly_fallback) {
326     bool                   is_run_good = true;
327     Ceed_Hip              *hip_data;
328     CeedInt                num_elem, num_input_fields, num_output_fields;
329     CeedEvalMode           eval_mode;
330     CeedScalar            *assembled_array;
331     CeedQFunctionField    *qf_input_fields, *qf_output_fields;
332     CeedQFunction_Hip_gen *qf_data;
333     CeedQFunction          qf;
334     CeedOperatorField     *op_input_fields, *op_output_fields;
335 
336     CeedCallBackend(CeedGetData(ceed, &hip_data));
337     CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
338     CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
339     CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
340     CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
341     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
342 
343     // Input vectors
344     for (CeedInt i = 0; i < num_input_fields; i++) {
345       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
346       if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
347         data->fields.inputs[i] = NULL;
348       } else {
349         bool       is_active;
350         CeedVector vec;
351 
352         // Get input vector
353         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
354         is_active = vec == CEED_VECTOR_ACTIVE;
355         if (is_active) data->fields.inputs[i] = NULL;
356         else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
357         CeedCallBackend(CeedVectorDestroy(&vec));
358       }
359     }
360 
361     // Get context data
362     CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
363 
364     // Build objects if needed
365     if (build_objects) {
366       CeedInt qf_size_in = 0, qf_size_out = 0, Q;
367 
368       // Count number of active input fields
369       {
370         for (CeedInt i = 0; i < num_input_fields; i++) {
371           CeedInt    field_size;
372           CeedVector vec;
373 
374           // Get input vector
375           CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
376           // Check if active input
377           if (vec == CEED_VECTOR_ACTIVE) {
378             CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
379             qf_size_in += field_size;
380           }
381           CeedCallBackend(CeedVectorDestroy(&vec));
382         }
383         CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
384       }
385 
386       // Count number of active output fields
387       {
388         for (CeedInt i = 0; i < num_output_fields; i++) {
389           CeedInt    field_size;
390           CeedVector vec;
391 
392           // Get output vector
393           CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
394           // Check if active output
395           if (vec == CEED_VECTOR_ACTIVE) {
396             CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
397             qf_size_out += field_size;
398           }
399           CeedCallBackend(CeedVectorDestroy(&vec));
400         }
401         CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
402       }
403       CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
404 
405       // Actually build objects now
406       const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
407       CeedInt        strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
408 
409       // Create output restriction
410       CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out,
411                                                        (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides,
412                                                        rstr));
413       // Create assembled vector
414       CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
415     }
416 
417     // Assembly array
418     CeedCallBackend(CeedVectorGetArrayWrite(*assembled, CEED_MEM_DEVICE, &assembled_array));
419 
420     // Assemble QFunction
421     bool  is_tensor = false;
422     void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};
423 
424     CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
425     CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
426 
427     if (is_tensor) {
428       CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
429     } else {
430       CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;
431 
432       elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
433       block_sizes[2]  = elems_per_block;
434     }
435     if (data->dim == 1 || !is_tensor) {
436       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
437       CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
438 
439       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
440                                                     sharedMem, &is_run_good, opargs));
441     } else if (data->dim == 2) {
442       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
443       CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
444 
445       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
446                                                     sharedMem, &is_run_good, opargs));
447     } else if (data->dim == 3) {
448       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
449       CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
450 
451       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
452                                                     sharedMem, &is_run_good, opargs));
453     }
454 
455     // Restore input arrays
456     for (CeedInt i = 0; i < num_input_fields; i++) {
457       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
458       if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
459       } else {
460         bool       is_active;
461         CeedVector vec;
462 
463         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
464         is_active = vec == CEED_VECTOR_ACTIVE;
465         if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
466         CeedCallBackend(CeedVectorDestroy(&vec));
467       }
468     }
469 
470     // Restore context data
471     CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
472 
473     // Restore assembly array
474     CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
475 
476     // Cleanup
477     CeedCallBackend(CeedQFunctionDestroy(&qf));
478     if (!is_run_good) {
479       data->use_assembly_fallback = true;
480       if (build_objects) {
481         CeedCallBackend(CeedVectorDestroy(assembled));
482         CeedCallBackend(CeedElemRestrictionDestroy(rstr));
483       }
484     }
485   }
486   CeedCallBackend(CeedDestroy(&ceed));
487 
488   // Fallback, if needed
489   if (data->use_assembly_fallback) {
490     CeedOperator op_fallback;
491 
492     CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LineearAssembleQFunction\n");
493     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
494     CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
495     return CEED_ERROR_SUCCESS;
496   }
497   return CEED_ERROR_SUCCESS;
498 }
499 
500 static int CeedOperatorLinearAssembleQFunction_Hip_gen(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
501   return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, true, assembled, rstr, request);
502 }
503 
504 static int CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
505   return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, false, &assembled, &rstr, request);
506 }
507 
508 //------------------------------------------------------------------------------
509 // AtPoints diagonal assembly
510 //------------------------------------------------------------------------------
static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) {
  // Add the diagonal of an AtPoints operator into `assembled` using a
  // JiT-compiled assembly kernel.  The kernel is only built when the number of
  // active input and output bases match; any build or launch failure flags the
  // operator to use the reference-backend fallback.
  Ceed                  ceed;
  CeedOperator_Hip_gen *data;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &data));

  // Build the assembly kernel (once), unless a previous failure already forced the fallback
  if (!data->assemble_diagonal && !data->use_assembly_fallback) {
    bool                     is_build_good = false;
    CeedInt                  num_active_bases_in, num_active_bases_out;
    CeedOperatorAssemblyData assembly_data;

    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
    CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
                                                         NULL, NULL));
    // Only square active-basis configurations are supported by this kernel
    if (num_active_bases_in == num_active_bases_out) {
      CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(op, &is_build_good));
    }
    if (!is_build_good) data->use_assembly_fallback = true;
  }

  // Try assembly
  if (!data->use_assembly_fallback) {
    bool                   is_run_good = true;
    Ceed_Hip              *hip_data;
    CeedInt                num_elem, num_input_fields, num_output_fields;
    CeedEvalMode           eval_mode;
    CeedScalar            *assembled_array;
    CeedQFunctionField    *qf_input_fields, *qf_output_fields;
    CeedQFunction_Hip_gen *qf_data;
    CeedQFunction          qf;
    CeedOperatorField     *op_input_fields, *op_output_fields;

    // NOTE(review): hip_data is fetched but never referenced below — confirm whether it can be removed
    CeedCallBackend(CeedGetData(ceed, &hip_data));
    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

    // Input vectors: passive fields borrow a read-only device array;
    // active fields are set to NULL (handled by the assembly kernel itself)
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
        data->fields.inputs[i] = NULL;
      } else {
        bool       is_active;
        CeedVector vec;

        // Get input vector
        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
        is_active = vec == CEED_VECTOR_ACTIVE;
        if (is_active) data->fields.inputs[i] = NULL;
        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
        CeedCallBackend(CeedVectorDestroy(&vec));
      }
    }

    // Point coordinates
    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
      CeedCallBackend(CeedVectorDestroy(&vec));

      // Points per elem — rebuild the cached device array only when the element
      // count changed since the last use.
      if (num_elem != data->points.num_elem) {
        CeedInt            *points_per_elem;
        // NOTE(review): num_bytes is a CeedInt; assumes num_elem * sizeof(CeedInt)
        // fits without overflow — confirm for very large element counts.
        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
        CeedElemRestriction rstr_points = NULL;

        data->points.num_elem = num_elem;
        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
        for (CeedInt e = 0; e < num_elem; e++) {
          CeedInt num_points_elem;

          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
          points_per_elem[e] = num_points_elem;
        }
        // Release any previous device copy before allocating the new one
        if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
        CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
        CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
        CeedCallBackend(CeedFree(&points_per_elem));
      }
    }

    // Get context data
    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

    // Assembly array — read-write access since the kernel adds into the diagonal
    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));

    // Assemble diagonal — argument list must match the JiT-generated kernel signature
    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};

    CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};

    // One element per block for diagonal assembly (block_sizes[2] forced to 1)
    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
    block_sizes[2] = 1;
    if (data->dim == 1) {
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
                                                    sharedMem, &is_run_good, opargs));
    } else if (data->dim == 2) {
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
                                                    sharedMem, &is_run_good, opargs));
    } else if (data->dim == 3) {
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
                                                    sharedMem, &is_run_good, opargs));
    }
    // Wait for the kernel before releasing the borrowed arrays
    CeedCallHip(ceed, hipDeviceSynchronize());

    // Restore input arrays (only passive fields borrowed an array above)
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
      } else {
        bool       is_active;
        CeedVector vec;

        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
        is_active = vec == CEED_VECTOR_ACTIVE;
        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
        CeedCallBackend(CeedVectorDestroy(&vec));
      }
    }

    // Restore point coordinates
    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }

    // Restore context data
    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));

    // Restore assembly array
    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));

    // Cleanup
    CeedCallBackend(CeedQFunctionDestroy(&qf));
    if (!is_run_good) data->use_assembly_fallback = true;
  }
  CeedCallBackend(CeedDestroy(&ceed));

  // Fallback, if needed
  if (data->use_assembly_fallback) {
    CeedOperator op_fallback;

    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints LinearAssembleAddDiagonal\n");
    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
    CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request));
    return CEED_ERROR_SUCCESS;
  }
  return CEED_ERROR_SUCCESS;
}
683 
684 //------------------------------------------------------------------------------
685 // AtPoints full assembly
686 //------------------------------------------------------------------------------
static int CeedOperatorAssembleSingleAtPoints_Hip_gen(CeedOperator op, CeedInt offset, CeedVector assembled) {
  Ceed                  ceed;
  CeedOperator_Hip_gen *data;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &data));

  // Build the assembly kernel (only attempted once; skipped entirely if a
  // previous build or run already forced the fallback path)
  if (!data->assemble_full && !data->use_assembly_fallback) {
    bool                     is_build_good = false;
    CeedInt                  num_active_bases_in, num_active_bases_out;
    CeedOperatorAssemblyData assembly_data;

    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
    CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
                                                         NULL, NULL));
    // Only build when active input and output basis counts match; otherwise
    // is_build_good stays false and we fall through to the fallback
    if (num_active_bases_in == num_active_bases_out) {
      CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(op, &is_build_good));
    }
    if (!is_build_good) {
      CeedDebug(ceed, "Single Operator Assemble at Points compile failed, using fallback\n");
      data->use_assembly_fallback = true;
    }
  }

  // Try assembly
  if (!data->use_assembly_fallback) {
    bool                   is_run_good = true;
    Ceed_Hip              *Hip_data;
    CeedInt                num_elem, num_input_fields, num_output_fields;
    CeedEvalMode           eval_mode;
    CeedScalar            *assembled_array;
    CeedQFunctionField    *qf_input_fields, *qf_output_fields;
    CeedQFunction_Hip_gen *qf_data;
    CeedQFunction          qf;
    CeedOperatorField     *op_input_fields, *op_output_fields;

    // NOTE(review): Hip_data is fetched here but never used in this function —
    // possibly left over from a stream-based launch path; confirm before removing
    CeedCallBackend(CeedGetData(ceed, &Hip_data));
    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
    CeedDebug(ceed, "Running single operator assemble for /gpu/hip/gen\n");

    // Input vectors — grab read-only device pointers for all passive inputs;
    // active and WEIGHT fields get NULL (the kernel supplies those itself)
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
        data->fields.inputs[i] = NULL;
      } else {
        bool       is_active;
        CeedVector vec;

        // Get input vector
        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
        is_active = vec == CEED_VECTOR_ACTIVE;
        if (is_active) data->fields.inputs[i] = NULL;
        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
        CeedCallBackend(CeedVectorDestroy(&vec));
      }
    }

    // Point coordinates
    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
      CeedCallBackend(CeedVectorDestroy(&vec));

      // Points per elem — lazily (re)build the device-side cache of
      // per-element point counts whenever the element count changes
      if (num_elem != data->points.num_elem) {
        CeedInt            *points_per_elem;
        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
        CeedElemRestriction rstr_points = NULL;

        data->points.num_elem = num_elem;
        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
        for (CeedInt e = 0; e < num_elem; e++) {
          CeedInt num_points_elem;

          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
          points_per_elem[e] = num_points_elem;
        }
        // Free any stale device buffer before allocating for the new size
        if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
        CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
        CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
        CeedCallBackend(CeedFree(&points_per_elem));
      }
    }

    // Get context data
    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

    // Assembly array — kernel writes at the caller-provided offset
    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
    CeedScalar *assembled_offset_array = &assembled_array[offset];

    // Assemble full matrix — argument order must match the generated kernel's signature
    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields,          &data->B,
                      &data->G,          &data->W,      &data->points,  &assembled_offset_array};

    // Thread block is thread_1d (x thread_1d for dim > 1); block_sizes[2] is
    // elements per block, filled in by BlockGridCalculate_Hip_gen below
    CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};

    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
    // NOTE(review): the elements-per-block computed above is immediately
    // overridden to one element per block — presumably required by the
    // assembly kernel; confirm against the kernel generator
    block_sizes[2] = 1;
    if (data->dim == 1) {
      // grid = ceil(num_elem / elems-per-block); with block_sizes[2] == 1 this is num_elem
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      // Shared memory: one scalar slab of thread_1d per element in the block
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
                                                    &is_run_good, opargs));
    } else if (data->dim == 2) {
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      // Shared memory: thread_1d^2 scalars per element in the block
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
                                                    &is_run_good, opargs));
    } else if (data->dim == 3) {
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
                                                    &is_run_good, opargs));
    }
    // Ensure the kernel has finished before device arrays are restored below
    CeedCallHip(ceed, hipDeviceSynchronize());

    // Restore input arrays
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
      } else {
        bool       is_active;
        CeedVector vec;

        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
        is_active = vec == CEED_VECTOR_ACTIVE;
        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
        CeedCallBackend(CeedVectorDestroy(&vec));
      }
    }

    // Restore point coordinates
    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }

    // Restore context data
    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));

    // Restore assembly array
    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));

    // Cleanup
    CeedCallBackend(CeedQFunctionDestroy(&qf));
    // A failed launch flips the operator to the fallback path permanently
    if (!is_run_good) {
      CeedDebug(ceed, "Single Operator Assemble at Points run failed, using fallback\n");
      data->use_assembly_fallback = true;
    }
  }
  CeedCallBackend(CeedDestroy(&ceed));

  // Fallback, if needed
  if (data->use_assembly_fallback) {
    CeedOperator op_fallback;

    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints SingleOperatorAssemble\n");
    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
    CeedCallBackend(CeedOperatorAssembleSingle(op_fallback, offset, assembled));
    return CEED_ERROR_SUCCESS;
  }
  return CEED_ERROR_SUCCESS;
}
868 
869 //------------------------------------------------------------------------------
870 // Create operator
871 //------------------------------------------------------------------------------
872 int CeedOperatorCreate_Hip_gen(CeedOperator op) {
873   bool                  is_composite, is_at_points;
874   Ceed                  ceed;
875   CeedOperator_Hip_gen *impl;
876 
877   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
878   CeedCallBackend(CeedCalloc(1, &impl));
879   CeedCallBackend(CeedOperatorSetData(op, impl));
880   CeedCall(CeedOperatorIsComposite(op, &is_composite));
881   if (is_composite) {
882     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Hip_gen));
883   } else {
884     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen));
885   }
886   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
887   if (is_at_points) {
888     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen));
889     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Hip_gen));
890   }
891   if (!is_at_points) {
892     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip_gen));
893     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen));
894   }
895   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen));
896   CeedCallBackend(CeedDestroy(&ceed));
897   return CEED_ERROR_SUCCESS;
898 }
899 
900 //------------------------------------------------------------------------------
901