xref: /libCEED/rust/libceed-sys/c-src/backends/hip-gen/ceed-hip-gen-operator.c (revision ca38d01d95e1d63e8425c25ab66131b07bd24aa2)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-source/hip/hip-types.h>
11 #include <stddef.h>
12 #include <hip/hiprtc.h>
13 
14 #include "../hip/ceed-hip-common.h"
15 #include "../hip/ceed-hip-compile.h"
16 #include "ceed-hip-gen-operator-build.h"
17 #include "ceed-hip-gen.h"
18 
19 //------------------------------------------------------------------------------
20 // Destroy operator
21 //------------------------------------------------------------------------------
22 static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
23   Ceed                  ceed;
24   CeedOperator_Hip_gen *impl;
25   bool                  is_composite;
26 
27   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
28   CeedCallBackend(CeedOperatorGetData(op, &impl));
29   CeedCallBackend(CeedOperatorIsComposite(op, &is_composite));
30   if (is_composite) {
31     CeedInt num_suboperators;
32 
33     CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
34     for (CeedInt i = 0; i < num_suboperators; i++) {
35       if (impl->streams[i]) CeedCallHip(ceed, hipStreamDestroy(impl->streams[i]));
36       impl->streams[i] = NULL;
37     }
38   }
39   if (impl->module) CeedCallHip(ceed, hipModuleUnload(impl->module));
40   if (impl->module_assemble_full) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_full));
41   if (impl->module_assemble_diagonal) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_diagonal));
42   if (impl->module_assemble_qfunction) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_qfunction));
43   if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem));
44   CeedCallBackend(CeedFree(&impl));
45   CeedCallBackend(CeedDestroy(&ceed));
46   return CEED_ERROR_SUCCESS;
47 }
48 
49 //------------------------------------------------------------------------------
50 // Apply and add to output
51 //------------------------------------------------------------------------------
// Core of ApplyAdd: build (if needed) and launch the fused operator kernel on `stream`,
// reading active input from `input_arr` and adding into `output_arr`.
// On any recoverable failure *is_run_good is set false and CEED_ERROR_SUCCESS is still
// returned so the caller can fall back to the reference backend.
// NOTE(review): `request` is currently unused in this core routine.
static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream, const CeedScalar *input_arr, CeedScalar *output_arr,
                                            bool *is_run_good, CeedRequest *request) {
  bool                   is_at_points, is_tensor;
  Ceed                   ceed;
  CeedInt                num_elem, num_input_fields, num_output_fields;
  CeedEvalMode           eval_mode;
  CeedQFunctionField    *qf_input_fields, *qf_output_fields;
  CeedQFunction_Hip_gen *qf_data;
  CeedQFunction          qf;
  CeedOperatorField     *op_input_fields, *op_output_fields;
  CeedOperator_Hip_gen  *data;

  // Creation of the operator
  CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, is_run_good));
  // Kernel build failure is not an error here; caller is expected to fall back
  if (!(*is_run_good)) return CEED_ERROR_SUCCESS;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &data));
  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

  // Input vectors
  // Collect device pointers for every input field; the active field uses the caller-supplied array
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
      data->fields.inputs[i] = NULL;
    } else {
      bool       is_active;
      CeedVector vec;

      // Get input vector
      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (is_active) data->fields.inputs[i] = input_arr;
      else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Output vectors
  // Same pattern for outputs, using writable access for passive vectors
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
      data->fields.outputs[i] = NULL;
    } else {
      bool       is_active;
      CeedVector vec;

      // Get output vector
      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (is_active) data->fields.outputs[i] = output_arr;
      else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Point coordinates, if needed
  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
  if (is_at_points) {
    // Coords
    CeedVector vec;

    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
    CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
    CeedCallBackend(CeedVectorDestroy(&vec));

    // Points per elem
    // Rebuild the cached device array only when the element count changed since last apply
    if (num_elem != data->points.num_elem) {
      CeedInt            *points_per_elem;
      const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
      CeedElemRestriction rstr_points = NULL;

      data->points.num_elem = num_elem;
      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
      CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
      for (CeedInt e = 0; e < num_elem; e++) {
        CeedInt num_points_elem;

        CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
        points_per_elem[e] = num_points_elem;
      }
      // Replace the previous device copy, then upload the freshly gathered host counts
      if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
      CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
      CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
      CeedCallBackend(CeedFree(&points_per_elem));
    }
  }

  // Get context data
  CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

  // Apply operator
  // Argument order must match the generated kernel's parameter list
  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};

  CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
  // block_sizes = {x, y, z}; z (elements per block) is filled in below
  CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};

  if (is_tensor) {
    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
    if (is_at_points) block_sizes[2] = 1;
  } else {
    // Non-tensor: cap the thread block at roughly 256 threads, at least one element per block
    CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;

    elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
    block_sizes[2]  = elems_per_block;
  }
  // Launch with shared memory sized per thread-block; grid is ceil(num_elem / elems_per_block)
  if (data->dim == 1 || !is_tensor) {
    CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
    CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);

    CeedCallBackend(
        CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
  } else if (data->dim == 2) {
    CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

    CeedCallBackend(
        CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
  } else if (data->dim == 3) {
    CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

    CeedCallBackend(
        CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
  }

  // Restore input arrays
  // Only passive vectors were acquired above; active arrays belong to the caller
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
    } else {
      bool       is_active;
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Restore output arrays
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
    } else {
      bool       is_active;
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Restore point coordinates, if needed
  if (is_at_points) {
    CeedVector vec;

    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
    CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
    CeedCallBackend(CeedVectorDestroy(&vec));
  }

  // Restore context data
  CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));

  // Cleanup
  CeedCallBackend(CeedDestroy(&ceed));
  CeedCallBackend(CeedQFunctionDestroy(&qf));
  // Remember the failure so future applies go straight to the fallback
  if (!(*is_run_good)) data->use_fallback = true;
  return CEED_ERROR_SUCCESS;
}
231 
232 static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
233   bool              is_run_good = false;
234   const CeedScalar *input_arr   = NULL;
235   CeedScalar       *output_arr  = NULL;
236 
237   // Try to run kernel
238   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
239   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
240   CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(op, NULL, input_arr, output_arr, &is_run_good, request));
241   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
242   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
243 
244   // Fallback on unsuccessful run
245   if (!is_run_good) {
246     CeedOperator op_fallback;
247 
248     CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n");
249     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
250     CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
251   }
252   return CEED_ERROR_SUCCESS;
253 }
254 
// ApplyAdd for a composite operator: each suboperator is launched on its own
// lazily-created HIP stream so the fused kernels can overlap, then all streams
// are synchronized. Suboperators whose kernel failed are rerun via the
// reference-backend fallback.
static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
  bool                  is_run_good[CEED_COMPOSITE_MAX] = {false};
  CeedInt               num_suboperators;
  const CeedScalar     *input_arr  = NULL;
  CeedScalar           *output_arr = NULL;
  Ceed                  ceed;
  CeedOperator_Hip_gen *impl;
  CeedOperator         *sub_operators;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &impl));
  CeedCallBackend(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
  CeedCallBackend(CeedCompositeOperatorGetSubList(op, &sub_operators));
  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
  for (CeedInt i = 0; i < num_suboperators; i++) {
    CeedInt num_elem = 0;

    CeedCallBackend(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
    if (num_elem > 0) {
      // Create the stream on first use; it is destroyed in CeedOperatorDestroy_Hip_gen
      if (!impl->streams[i]) CeedCallHip(ceed, hipStreamCreate(&impl->streams[i]));
      CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], impl->streams[i], input_arr, output_arr, &is_run_good[i], request));
    } else {
      // Nothing to compute; treat as success so no fallback is attempted
      is_run_good[i] = true;
    }
  }

  // Wait for every successfully launched suboperator kernel to finish
  for (CeedInt i = 0; i < num_suboperators; i++) {
    if (impl->streams[i]) {
      if (is_run_good[i]) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[i]));
    }
  }
  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
  CeedCallHip(ceed, hipDeviceSynchronize());

  // Fallback on unsuccessful run
  // Each failed suboperator is individually re-applied through its reference fallback
  for (CeedInt i = 0; i < num_suboperators; i++) {
    if (!is_run_good[i]) {
      CeedOperator op_fallback;

      CeedDebug(ceed, "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n");
      CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
      CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
    }
  }
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
304 
305 //------------------------------------------------------------------------------
306 // QFunction assembly
307 //------------------------------------------------------------------------------
308 static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
309                                                            CeedRequest *request) {
310   Ceed                  ceed;
311   CeedOperator_Hip_gen *data;
312 
313   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
314   CeedCallBackend(CeedOperatorGetData(op, &data));
315 
316   // Build the assembly kernel
317   if (!data->assemble_qfunction && !data->use_assembly_fallback) {
318     bool is_build_good = false;
319 
320     CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
321     if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(op, &is_build_good));
322     if (!is_build_good) data->use_assembly_fallback = true;
323   }
324 
325   // Try assembly
326   if (!data->use_assembly_fallback) {
327     bool                   is_run_good = true;
328     Ceed_Hip              *hip_data;
329     CeedInt                num_elem, num_input_fields, num_output_fields;
330     CeedEvalMode           eval_mode;
331     CeedScalar            *assembled_array;
332     CeedQFunctionField    *qf_input_fields, *qf_output_fields;
333     CeedQFunction_Hip_gen *qf_data;
334     CeedQFunction          qf;
335     CeedOperatorField     *op_input_fields, *op_output_fields;
336 
337     CeedCallBackend(CeedGetData(ceed, &hip_data));
338     CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
339     CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
340     CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
341     CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
342     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
343 
344     // Input vectors
345     for (CeedInt i = 0; i < num_input_fields; i++) {
346       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
347       if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
348         data->fields.inputs[i] = NULL;
349       } else {
350         bool       is_active;
351         CeedVector vec;
352 
353         // Get input vector
354         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
355         is_active = vec == CEED_VECTOR_ACTIVE;
356         if (is_active) data->fields.inputs[i] = NULL;
357         else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
358         CeedCallBackend(CeedVectorDestroy(&vec));
359       }
360     }
361 
362     // Get context data
363     CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
364 
365     // Build objects if needed
366     if (build_objects) {
367       CeedInt qf_size_in = 0, qf_size_out = 0, Q;
368 
369       // Count number of active input fields
370       {
371         for (CeedInt i = 0; i < num_input_fields; i++) {
372           CeedInt    field_size;
373           CeedVector vec;
374 
375           // Get input vector
376           CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
377           // Check if active input
378           if (vec == CEED_VECTOR_ACTIVE) {
379             CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
380             qf_size_in += field_size;
381           }
382           CeedCallBackend(CeedVectorDestroy(&vec));
383         }
384         CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
385       }
386 
387       // Count number of active output fields
388       {
389         for (CeedInt i = 0; i < num_output_fields; i++) {
390           CeedInt    field_size;
391           CeedVector vec;
392 
393           // Get output vector
394           CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
395           // Check if active output
396           if (vec == CEED_VECTOR_ACTIVE) {
397             CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
398             qf_size_out += field_size;
399           }
400           CeedCallBackend(CeedVectorDestroy(&vec));
401         }
402         CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
403       }
404       CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
405 
406       // Actually build objects now
407       const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
408       CeedInt        strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
409 
410       // Create output restriction
411       CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out,
412                                                        (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides,
413                                                        rstr));
414       // Create assembled vector
415       CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
416     }
417 
418     // Assembly array
419     CeedCallBackend(CeedVectorGetArrayWrite(*assembled, CEED_MEM_DEVICE, &assembled_array));
420 
421     // Assemble QFunction
422     bool  is_tensor = false;
423     void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};
424 
425     CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
426     CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
427 
428     if (is_tensor) {
429       CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
430     } else {
431       CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;
432 
433       elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
434       block_sizes[2]  = elems_per_block;
435     }
436     if (data->dim == 1 || !is_tensor) {
437       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
438       CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
439 
440       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
441                                                     sharedMem, &is_run_good, opargs));
442     } else if (data->dim == 2) {
443       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
444       CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
445 
446       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
447                                                     sharedMem, &is_run_good, opargs));
448     } else if (data->dim == 3) {
449       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
450       CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
451 
452       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
453                                                     sharedMem, &is_run_good, opargs));
454     }
455 
456     // Restore input arrays
457     for (CeedInt i = 0; i < num_input_fields; i++) {
458       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
459       if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
460       } else {
461         bool       is_active;
462         CeedVector vec;
463 
464         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
465         is_active = vec == CEED_VECTOR_ACTIVE;
466         if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
467         CeedCallBackend(CeedVectorDestroy(&vec));
468       }
469     }
470 
471     // Restore context data
472     CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
473 
474     // Restore assembly array
475     CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
476 
477     // Cleanup
478     CeedCallBackend(CeedQFunctionDestroy(&qf));
479     if (!is_run_good) {
480       data->use_assembly_fallback = true;
481       if (build_objects) {
482         CeedCallBackend(CeedVectorDestroy(assembled));
483         CeedCallBackend(CeedElemRestrictionDestroy(rstr));
484       }
485     }
486   }
487   CeedCallBackend(CeedDestroy(&ceed));
488 
489   // Fallback, if needed
490   if (data->use_assembly_fallback) {
491     CeedOperator op_fallback;
492 
493     CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LineearAssembleQFunction\n");
494     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
495     CeedCallBackend(CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
496     return CEED_ERROR_SUCCESS;
497   }
498   return CEED_ERROR_SUCCESS;
499 }
500 
501 static int CeedOperatorLinearAssembleQFunction_Hip_gen(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
502   return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, true, assembled, rstr, request);
503 }
504 
505 static int CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
506   return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, false, &assembled, &rstr, request);
507 }
508 
509 //------------------------------------------------------------------------------
510 // AtPoints diagonal assembly
511 //------------------------------------------------------------------------------
// Add the diagonal of an AtPoints operator into `assembled` on the device.
// Builds a dedicated diagonal-assembly kernel on first use (only when the
// number of active input and output bases match); any build or run failure
// flags `use_assembly_fallback` and delegates to the /gpu/hip/ref operator.
static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) {
  Ceed                  ceed;
  CeedOperator_Hip_gen *data;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &data));

  // Build the assembly kernel
  if (!data->assemble_diagonal && !data->use_assembly_fallback) {
    bool                     is_build_good = false;
    CeedInt                  num_active_bases_in, num_active_bases_out;
    CeedOperatorAssemblyData assembly_data;

    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
    CeedCallBackend(
        CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL));
    // The generated diagonal kernel only supports matching in/out active bases
    if (num_active_bases_in == num_active_bases_out) {
      CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(op, &is_build_good));
    }
    if (!is_build_good) data->use_assembly_fallback = true;
  }

  // Try assembly
  if (!data->use_assembly_fallback) {
    bool                   is_run_good = true;
    Ceed_Hip              *hip_data;
    CeedInt                num_elem, num_input_fields, num_output_fields;
    CeedEvalMode           eval_mode;
    CeedScalar            *assembled_array;
    CeedQFunctionField    *qf_input_fields, *qf_output_fields;
    CeedQFunction_Hip_gen *qf_data;
    CeedQFunction          qf;
    CeedOperatorField     *op_input_fields, *op_output_fields;

    CeedCallBackend(CeedGetData(ceed, &hip_data));
    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

    // Input vectors
    // Active inputs are NULL: the diagonal kernel supplies unit vectors internally
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
        data->fields.inputs[i] = NULL;
      } else {
        bool       is_active;
        CeedVector vec;

        // Get input vector
        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
        is_active = vec == CEED_VECTOR_ACTIVE;
        if (is_active) data->fields.inputs[i] = NULL;
        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
        CeedCallBackend(CeedVectorDestroy(&vec));
      }
    }

    // Point coordinates
    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
      CeedCallBackend(CeedVectorDestroy(&vec));

      // Points per elem
      // Rebuild the cached device array only when the element count changed
      if (num_elem != data->points.num_elem) {
        CeedInt            *points_per_elem;
        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
        CeedElemRestriction rstr_points = NULL;

        data->points.num_elem = num_elem;
        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
        for (CeedInt e = 0; e < num_elem; e++) {
          CeedInt num_points_elem;

          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
          points_per_elem[e] = num_points_elem;
        }
        // Replace the previous device copy, then upload the freshly gathered host counts
        if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
        CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
        CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
        CeedCallBackend(CeedFree(&points_per_elem));
      }
    }

    // Get context data
    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

    // Assembly array
    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));

    // Assemble diagonal
    // Argument order must match the generated diagonal-assembly kernel's parameter list
    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};

    CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};

    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
    // AtPoints kernels process one element per thread block
    block_sizes[2] = 1;
    if (data->dim == 1) {
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
                                                    sharedMem, &is_run_good, opargs));
    } else if (data->dim == 2) {
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
                                                    sharedMem, &is_run_good, opargs));
    } else if (data->dim == 3) {
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
                                                    sharedMem, &is_run_good, opargs));
    }
    CeedCallHip(ceed, hipDeviceSynchronize());

    // Restore input arrays
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
      } else {
        bool       is_active;
        CeedVector vec;

        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
        is_active = vec == CEED_VECTOR_ACTIVE;
        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
        CeedCallBackend(CeedVectorDestroy(&vec));
      }
    }

    // Restore point coordinates
    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }

    // Restore context data
    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));

    // Restore assembly array
    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));

    // Cleanup
    CeedCallBackend(CeedQFunctionDestroy(&qf));
    if (!is_run_good) data->use_assembly_fallback = true;
  }
  CeedCallBackend(CeedDestroy(&ceed));

  // Fallback, if needed
  if (data->use_assembly_fallback) {
    CeedOperator op_fallback;

    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints LinearAssembleAddDiagonal\n");
    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
    CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request));
    return CEED_ERROR_SUCCESS;
  }
  return CEED_ERROR_SUCCESS;
}
684 
685 //------------------------------------------------------------------------------
686 // AtPoints full assembly
687 //------------------------------------------------------------------------------
688 static int CeedSingleOperatorAssembleAtPoints_Hip_gen(CeedOperator op, CeedInt offset, CeedVector assembled) {
689   Ceed                  ceed;
690   CeedOperator_Hip_gen *data;
691 
692   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
693   CeedCallBackend(CeedOperatorGetData(op, &data));
694 
695   // Build the assembly kernel
696   if (!data->assemble_full && !data->use_assembly_fallback) {
697     bool                     is_build_good = false;
698     CeedInt                  num_active_bases_in, num_active_bases_out;
699     CeedOperatorAssemblyData assembly_data;
700 
701     CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
702     CeedCallBackend(
703         CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL));
704     if (num_active_bases_in == num_active_bases_out) {
705       CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
706       if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(op, &is_build_good));
707     }
708     if (!is_build_good) {
709       CeedDebug(ceed, "Single Operator Assemble at Points compile failed, using fallback\n");
710       data->use_assembly_fallback = true;
711     }
712   }
713 
714   // Try assembly
715   if (!data->use_assembly_fallback) {
716     bool                   is_run_good = true;
717     Ceed_Hip              *Hip_data;
718     CeedInt                num_elem, num_input_fields, num_output_fields;
719     CeedEvalMode           eval_mode;
720     CeedScalar            *assembled_array;
721     CeedQFunctionField    *qf_input_fields, *qf_output_fields;
722     CeedQFunction_Hip_gen *qf_data;
723     CeedQFunction          qf;
724     CeedOperatorField     *op_input_fields, *op_output_fields;
725 
726     CeedCallBackend(CeedGetData(ceed, &Hip_data));
727     CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
728     CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
729     CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
730     CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
731     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
732     CeedDebug(ceed, "Running single operator assemble for /gpu/hip/gen\n");
733 
734     // Input vectors
735     for (CeedInt i = 0; i < num_input_fields; i++) {
736       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
737       if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
738         data->fields.inputs[i] = NULL;
739       } else {
740         bool       is_active;
741         CeedVector vec;
742 
743         // Get input vector
744         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
745         is_active = vec == CEED_VECTOR_ACTIVE;
746         if (is_active) data->fields.inputs[i] = NULL;
747         else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
748         CeedCallBackend(CeedVectorDestroy(&vec));
749       }
750     }
751 
752     // Point coordinates
753     {
754       CeedVector vec;
755 
756       CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
757       CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
758       CeedCallBackend(CeedVectorDestroy(&vec));
759 
760       // Points per elem
761       if (num_elem != data->points.num_elem) {
762         CeedInt            *points_per_elem;
763         const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
764         CeedElemRestriction rstr_points = NULL;
765 
766         data->points.num_elem = num_elem;
767         CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
768         CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
769         for (CeedInt e = 0; e < num_elem; e++) {
770           CeedInt num_points_elem;
771 
772           CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
773           points_per_elem[e] = num_points_elem;
774         }
775         if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
776         CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
777         CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
778         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
779         CeedCallBackend(CeedFree(&points_per_elem));
780       }
781     }
782 
783     // Get context data
784     CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
785 
786     // Assembly array
787     CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
788     CeedScalar *assembled_offset_array = &assembled_array[offset];
789 
790     // Assemble diagonal
791     void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields,          &data->B,
792                       &data->G,          &data->W,      &data->points,  &assembled_offset_array};
793 
794     CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};
795 
796     CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
797     block_sizes[2] = 1;
798     if (data->dim == 1) {
799       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
800       CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
801 
802       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
803                                                     &is_run_good, opargs));
804     } else if (data->dim == 2) {
805       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
806       CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
807 
808       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
809                                                     &is_run_good, opargs));
810     } else if (data->dim == 3) {
811       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
812       CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
813 
814       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
815                                                     &is_run_good, opargs));
816     }
817     CeedCallHip(ceed, hipDeviceSynchronize());
818 
819     // Restore input arrays
820     for (CeedInt i = 0; i < num_input_fields; i++) {
821       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
822       if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
823       } else {
824         bool       is_active;
825         CeedVector vec;
826 
827         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
828         is_active = vec == CEED_VECTOR_ACTIVE;
829         if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
830         CeedCallBackend(CeedVectorDestroy(&vec));
831       }
832     }
833 
834     // Restore point coordinates
835     {
836       CeedVector vec;
837 
838       CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
839       CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
840       CeedCallBackend(CeedVectorDestroy(&vec));
841     }
842 
843     // Restore context data
844     CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
845 
846     // Restore assembly array
847     CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));
848 
849     // Cleanup
850     CeedCallBackend(CeedQFunctionDestroy(&qf));
851     if (!is_run_good) {
852       CeedDebug(ceed, "Single Operator Assemble at Points run failed, using fallback\n");
853       data->use_assembly_fallback = true;
854     }
855   }
856   CeedCallBackend(CeedDestroy(&ceed));
857 
858   // Fallback, if needed
859   if (data->use_assembly_fallback) {
860     CeedOperator op_fallback;
861 
862     CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints SingleOperatorAssemble\n");
863     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
864     CeedCallBackend(CeedSingleOperatorAssemble(op_fallback, offset, assembled));
865     return CEED_ERROR_SUCCESS;
866   }
867   return CEED_ERROR_SUCCESS;
868 }
869 
870 //------------------------------------------------------------------------------
871 // Create operator
872 //------------------------------------------------------------------------------
873 int CeedOperatorCreate_Hip_gen(CeedOperator op) {
874   bool                  is_composite, is_at_points;
875   Ceed                  ceed;
876   CeedOperator_Hip_gen *impl;
877 
878   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
879   CeedCallBackend(CeedCalloc(1, &impl));
880   CeedCallBackend(CeedOperatorSetData(op, impl));
881   CeedCall(CeedOperatorIsComposite(op, &is_composite));
882   if (is_composite) {
883     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Hip_gen));
884   } else {
885     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen));
886   }
887   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
888   if (is_at_points) {
889     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen));
890     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssembleAtPoints_Hip_gen));
891   }
892   if (!is_at_points) {
893     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip_gen));
894     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen));
895   }
896   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen));
897   CeedCallBackend(CeedDestroy(&ceed));
898   return CEED_ERROR_SUCCESS;
899 }
900 
901 //------------------------------------------------------------------------------
902