xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator.c (revision a49e5d53e180225109bfad71df325c7cfa170c69)
1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-source/hip/hip-types.h>
11 #include <stddef.h>
12 #include <hip/hiprtc.h>
13 
14 #include "../hip/ceed-hip-common.h"
15 #include "../hip/ceed-hip-compile.h"
16 #include "ceed-hip-gen-operator-build.h"
17 #include "ceed-hip-gen.h"
18 
19 //------------------------------------------------------------------------------
20 // Destroy operator
21 //------------------------------------------------------------------------------
static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
  Ceed                  ceed;
  CeedOperator_Hip_gen *impl;
  bool                  is_composite;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &impl));
  CeedCallBackend(CeedOperatorIsComposite(op, &is_composite));
  if (is_composite) {
    CeedInt num_suboperators;

    // Composite operators lazily create one HIP stream per sub-operator (see ApplyAddComposite); release them all here
    // Note: was CeedCall; use CeedCallBackend for consistency with every other backend call in this file
    CeedCallBackend(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
    for (CeedInt i = 0; i < num_suboperators; i++) {
      if (impl->streams[i]) CeedCallHip(ceed, hipStreamDestroy(impl->streams[i]));
      impl->streams[i] = NULL;
    }
  }
  // Unload each JIT-compiled module that was actually built
  if (impl->module) CeedCallHip(ceed, hipModuleUnload(impl->module));
  if (impl->module_assemble_full) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_full));
  if (impl->module_assemble_diagonal) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_diagonal));
  if (impl->module_assemble_qfunction) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_qfunction));
  // Device-side cached points-per-element array (allocated in the apply/assembly paths)
  if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem));
  CeedCallBackend(CeedFree(&impl));
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
48 
49 //------------------------------------------------------------------------------
50 // Apply and add to output
51 //------------------------------------------------------------------------------
// Core apply-add: JIT-builds the fused operator kernel on first use, gathers
// device pointers for every input/output field, launches the kernel on
// `stream` (NULL => default stream), then restores all vector arrays.
// On a failed build or launch, *is_run_good is set false (and
// data->use_fallback is latched) while the return value remains
// CEED_ERROR_SUCCESS so the caller can dispatch to the reference backend.
// NOTE(review): `request` is accepted for interface conformity; it is not
// referenced in this body.
static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream, const CeedScalar *input_arr, CeedScalar *output_arr,
                                            bool *is_run_good, CeedRequest *request) {
  bool                   is_at_points, is_tensor;
  Ceed                   ceed;
  CeedInt                num_elem, num_input_fields, num_output_fields;
  CeedEvalMode           eval_mode;
  CeedQFunctionField    *qf_input_fields, *qf_output_fields;
  CeedQFunction_Hip_gen *qf_data;
  CeedQFunction          qf;
  CeedOperatorField     *op_input_fields, *op_output_fields;
  CeedOperator_Hip_gen  *data;

  // Creation of the operator (kernel build; bail out early if the build failed)
  CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, is_run_good));
  if (!(*is_run_good)) return CEED_ERROR_SUCCESS;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &data));
  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

  // Input vectors
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
      data->fields.inputs[i] = NULL;
    } else {
      bool       is_active;
      CeedVector vec;

      // Get input vector; the active field uses the caller-provided device array
      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (is_active) data->fields.inputs[i] = input_arr;
      else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Output vectors
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
      data->fields.outputs[i] = NULL;
    } else {
      bool       is_active;
      CeedVector vec;

      // Get output vector; the active field writes into the caller-provided device array
      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (is_active) data->fields.outputs[i] = output_arr;
      else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Point coordinates, if needed
  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
  if (is_at_points) {
    // Coords
    CeedVector vec;

    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
    CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
    CeedCallBackend(CeedVectorDestroy(&vec));

    // Points per elem: cached on device; rebuilt only when the element count changes
    if (num_elem != data->points.num_elem) {
      CeedInt            *points_per_elem;
      const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
      CeedElemRestriction rstr_points = NULL;

      data->points.num_elem = num_elem;
      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
      CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
      for (CeedInt e = 0; e < num_elem; e++) {
        CeedInt num_points_elem;

        CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
        points_per_elem[e] = num_points_elem;
      }
      // Free the previously-cached device array (if any) before reallocating at the new size
      if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
      CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
      CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
      CeedCallBackend(CeedFree(&points_per_elem));
    }
  }

  // Get context data
  CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

  // Apply operator: argument list must match the generated kernel's signature
  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};

  CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
  // Thread block: x = 1D thread count, y = 1D thread count again for tensor dim > 1, z = elements per block (set below)
  CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};

  if (is_tensor) {
    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
  } else {
    // Non-tensor heuristic: cap the block at ~256 threads, at least 1 element per block
    CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;

    elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
    block_sizes[2]  = elems_per_block;
  }
  // Launch; grid is ceil(num_elem / elems_per_block); shared memory scales with thread_1d (1D) or thread_1d^2 (2D/3D)
  if (data->dim == 1 || !is_tensor) {
    CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
    CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);

    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
                                                  is_run_good, opargs));
  } else if (data->dim == 2) {
    CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
                                                  is_run_good, opargs));
  } else if (data->dim == 3) {
    CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
                                                  is_run_good, opargs));
  }

  // Restore input arrays (active field array is owned by the caller; skip it)
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
    } else {
      bool       is_active;
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Restore output arrays (active field array is owned by the caller; skip it)
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
    } else {
      bool       is_active;
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
  }

  // Restore point coordinates, if needed
  if (is_at_points) {
    CeedVector vec;

    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
    CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
    CeedCallBackend(CeedVectorDestroy(&vec));
  }

  // Restore context data
  CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));

  // Cleanup
  CeedCallBackend(CeedDestroy(&ceed));
  CeedCallBackend(CeedQFunctionDestroy(&qf));
  // Latch the fallback flag so future applies skip the generated kernel
  if (!(*is_run_good)) data->use_fallback = true;
  return CEED_ERROR_SUCCESS;
}
230 
// Apply-add for a single (non-composite) operator: run the fused generated
// kernel on the default stream, falling back to /gpu/hip/ref on failure.
static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
  bool              kernel_ok = false;
  const CeedScalar *d_input   = NULL;
  CeedScalar       *d_output  = NULL;

  // Attempt the generated kernel on the default (NULL) stream
  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &d_input));
  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &d_output));
  CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(op, NULL, d_input, d_output, &kernel_ok, request));
  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &d_input));
  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &d_output));

  // On an unsuccessful run, defer to the reference backend implementation
  if (!kernel_ok) {
    CeedOperator op_fallback;

    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n");
    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
    CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
  }
  return CEED_ERROR_SUCCESS;
}
253 
// Apply-add for a composite operator: launch each sub-operator's fused kernel.
// A "sequential" composite reuses stream 0 for every sub-operator (launches
// serialize); otherwise each sub-operator gets its own lazily-created stream
// so kernels may overlap.  Any sub-operator whose generated kernel fails is
// re-run through the /gpu/hip/ref fallback after all streams are drained.
static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
  bool                  is_run_good[CEED_COMPOSITE_MAX] = {false}, is_sequential;
  CeedInt               num_suboperators;
  const CeedScalar     *input_arr  = NULL;
  CeedScalar           *output_arr = NULL;
  Ceed                  ceed;
  CeedOperator_Hip_gen *impl;
  CeedOperator         *sub_operators;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &impl));
  CeedCallBackend(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
  CeedCallBackend(CeedOperatorCompositeGetSubList(op, &sub_operators));
  CeedCall(CeedOperatorCompositeIsSequential(op, &is_sequential));
  // All sub-operators share the same active input/output device arrays
  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
  for (CeedInt i = 0; i < num_suboperators; i++) {
    CeedInt       num_elem     = 0;
    const CeedInt stream_index = is_sequential ? 0 : i;

    CeedCallBackend(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
    if (num_elem > 0) {
      // Streams are created on first use and destroyed in CeedOperatorDestroy_Hip_gen
      if (!impl->streams[stream_index]) CeedCallHip(ceed, hipStreamCreate(&impl->streams[stream_index]));
      CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], impl->streams[stream_index], input_arr, output_arr, &is_run_good[i],
                                                       request));
    } else {
      // Empty sub-operator: nothing to do, but mark as successful so no fallback fires
      is_run_good[i] = true;
    }
  }
  // Drain the stream(s) before restoring the shared arrays
  if (is_sequential) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[0]));
  else {
    for (CeedInt i = 0; i < num_suboperators; i++) {
      if (impl->streams[i]) {
        if (is_run_good[i]) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[i]));
      }
    }
  }
  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
  CeedCallHip(ceed, hipDeviceSynchronize());

  // Fallback on unsuccessful run
  for (CeedInt i = 0; i < num_suboperators; i++) {
    if (!is_run_good[i]) {
      CeedOperator op_fallback;

      CeedDebug(ceed, "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n");
      CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
      CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
    }
  }
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
308 
309 //------------------------------------------------------------------------------
310 // QFunction assembly
311 //------------------------------------------------------------------------------
// Core QFunction assembly: JIT-builds the QFunction assembly kernel on first
// use, wires up device arrays for the passive input fields, optionally creates
// the assembled vector and its strided restriction (build_objects == true for
// the initial build, false for updates), launches the kernel, and restores all
// arrays.  On any build or run failure, data->use_assembly_fallback is latched
// and the /gpu/hip/ref implementation is used instead.
static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
                                                           CeedRequest *request) {
  Ceed                  ceed;
  CeedOperator_Hip_gen *data;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &data));

  // Build the assembly kernel (requires the main operator kernel to build first)
  if (!data->assemble_qfunction && !data->use_assembly_fallback) {
    bool is_build_good = false;

    CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
    if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(op, &is_build_good));
    if (!is_build_good) data->use_assembly_fallback = true;
  }

  // Try assembly
  if (!data->use_assembly_fallback) {
    bool                   is_run_good = true;
    Ceed_Hip              *hip_data;
    CeedInt                num_elem, num_input_fields, num_output_fields;
    CeedEvalMode           eval_mode;
    CeedScalar            *assembled_array;
    CeedQFunctionField    *qf_input_fields, *qf_output_fields;
    CeedQFunction_Hip_gen *qf_data;
    CeedQFunction          qf;
    CeedOperatorField     *op_input_fields, *op_output_fields;

    CeedCallBackend(CeedGetData(ceed, &hip_data));
    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

    // Input vectors (active inputs get NULL: the assembly kernel supplies unit inputs)
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
        data->fields.inputs[i] = NULL;
      } else {
        bool       is_active;
        CeedVector vec;

        // Get input vector
        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
        is_active = vec == CEED_VECTOR_ACTIVE;
        if (is_active) data->fields.inputs[i] = NULL;
        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
        CeedCallBackend(CeedVectorDestroy(&vec));
      }
    }

    // Get context data
    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

    // Build objects if needed
    if (build_objects) {
      CeedInt qf_size_in = 0, qf_size_out = 0, Q;

      // Count number of active input fields
      {
        for (CeedInt i = 0; i < num_input_fields; i++) {
          CeedInt    field_size;
          CeedVector vec;

          // Get input vector
          CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
          // Check if active input
          if (vec == CEED_VECTOR_ACTIVE) {
            CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
            qf_size_in += field_size;
          }
          CeedCallBackend(CeedVectorDestroy(&vec));
        }
        CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
      }

      // Count number of active output fields
      {
        for (CeedInt i = 0; i < num_output_fields; i++) {
          CeedInt    field_size;
          CeedVector vec;

          // Get output vector
          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
          // Check if active output
          if (vec == CEED_VECTOR_ACTIVE) {
            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
            qf_size_out += field_size;
          }
          CeedCallBackend(CeedVectorDestroy(&vec));
        }
        CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
      }
      CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));

      // Actually build objects now; one (qf_size_in x qf_size_out) block per quadrature point
      const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
      CeedInt        strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */

      // Create output restriction
      CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out,
                                                       (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides,
                                                       rstr));
      // Create assembled vector
      CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
    }

    // Assembly array
    CeedCallBackend(CeedVectorGetArrayWrite(*assembled, CEED_MEM_DEVICE, &assembled_array));

    // Assemble QFunction; argument list matches the generated assembly kernel's signature
    bool  is_tensor = false;
    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};

    CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
    CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};

    if (is_tensor) {
      CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
    } else {
      // Non-tensor heuristic: cap the block at ~256 threads, at least 1 element per block
      CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;

      elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
      block_sizes[2]  = elems_per_block;
    }
    // Launch; grid is ceil(num_elem / elems_per_block); shared memory scales with thread_1d (1D) or thread_1d^2 (2D/3D)
    if (data->dim == 1 || !is_tensor) {
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
                                                    sharedMem, &is_run_good, opargs));
    } else if (data->dim == 2) {
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
                                                    sharedMem, &is_run_good, opargs));
    } else if (data->dim == 3) {
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
                                                    sharedMem, &is_run_good, opargs));
    }

    // Restore input arrays
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
      } else {
        bool       is_active;
        CeedVector vec;

        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
        is_active = vec == CEED_VECTOR_ACTIVE;
        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
        CeedCallBackend(CeedVectorDestroy(&vec));
      }
    }

    // Restore context data
    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));

    // Restore assembly array
    CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));

    // Cleanup; on a failed run destroy any objects built above so the fallback recreates them
    CeedCallBackend(CeedQFunctionDestroy(&qf));
    if (!is_run_good) {
      data->use_assembly_fallback = true;
      if (build_objects) {
        CeedCallBackend(CeedVectorDestroy(assembled));
        CeedCallBackend(CeedElemRestrictionDestroy(rstr));
      }
    }
  }
  CeedCallBackend(CeedDestroy(&ceed));

  // Fallback, if needed
  if (data->use_assembly_fallback) {
    CeedOperator op_fallback;

    // Fixed typo in debug message: "LineearAssembleQFunction" -> "LinearAssembleQFunction"
    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LinearAssembleQFunction\n");
    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
    CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
    return CEED_ERROR_SUCCESS;
  }
  return CEED_ERROR_SUCCESS;
}
504 
// Initial QFunction assembly: build path creates the assembled vector and its
// strided element restriction before running the assembly kernel.
static int CeedOperatorLinearAssembleQFunction_Hip_gen(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
  const bool build_objects = true;

  return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, build_objects, assembled, rstr, request);
}
508 
// QFunction assembly update: reuses the caller-provided assembled vector and
// restriction (no object creation) when re-running the assembly kernel.
static int CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
  const bool build_objects = false;

  return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, build_objects, &assembled, &rstr, request);
}
512 
513 //------------------------------------------------------------------------------
514 // AtPoints diagonal assembly
515 //------------------------------------------------------------------------------
CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op,CeedVector assembled,CeedRequest * request)516 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) {
517   Ceed                  ceed;
518   CeedOperator_Hip_gen *data;
519 
520   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
521   CeedCallBackend(CeedOperatorGetData(op, &data));
522 
523   // Build the assembly kernel
524   if (!data->assemble_diagonal && !data->use_assembly_fallback) {
525     bool                     is_build_good = false;
526     CeedInt                  num_active_bases_in, num_active_bases_out;
527     CeedOperatorAssemblyData assembly_data;
528 
529     CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
530     CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
531                                                          NULL, NULL));
532     if (num_active_bases_in == num_active_bases_out) {
533       CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
534       if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(op, &is_build_good));
535     }
536     if (!is_build_good) data->use_assembly_fallback = true;
537   }
538 
539   // Try assembly
540   if (!data->use_assembly_fallback) {
541     bool                   is_run_good = true;
542     Ceed_Hip              *hip_data;
543     CeedInt                num_elem, num_input_fields, num_output_fields;
544     CeedEvalMode           eval_mode;
545     CeedScalar            *assembled_array;
546     CeedQFunctionField    *qf_input_fields, *qf_output_fields;
547     CeedQFunction_Hip_gen *qf_data;
548     CeedQFunction          qf;
549     CeedOperatorField     *op_input_fields, *op_output_fields;
550 
551     CeedCallBackend(CeedGetData(ceed, &hip_data));
552     CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
553     CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
554     CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
555     CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
556     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
557 
558     // Input vectors
559     for (CeedInt i = 0; i < num_input_fields; i++) {
560       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
561       if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
562         data->fields.inputs[i] = NULL;
563       } else {
564         bool       is_active;
565         CeedVector vec;
566 
567         // Get input vector
568         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
569         is_active = vec == CEED_VECTOR_ACTIVE;
570         if (is_active) data->fields.inputs[i] = NULL;
571         else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
572         CeedCallBackend(CeedVectorDestroy(&vec));
573       }
574     }
575 
576     // Point coordinates
577     {
578       CeedVector vec;
579 
580       CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
581       CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
582       CeedCallBackend(CeedVectorDestroy(&vec));
583 
584       // Points per elem
585       if (num_elem != data->points.num_elem) {
586         CeedInt            *points_per_elem;
587         const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
588         CeedElemRestriction rstr_points = NULL;
589 
590         data->points.num_elem = num_elem;
591         CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
592         CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
593         for (CeedInt e = 0; e < num_elem; e++) {
594           CeedInt num_points_elem;
595 
596           CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
597           points_per_elem[e] = num_points_elem;
598         }
599         if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
600         CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
601         CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
602         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
603         CeedCallBackend(CeedFree(&points_per_elem));
604       }
605     }
606 
607     // Get context data
608     CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
609 
610     // Assembly array
611     CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
612 
613     // Assemble diagonal
614     void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};
615 
616     CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};
617 
618     CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
619     block_sizes[2] = 1;
620     if (data->dim == 1) {
621       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
622       CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
623 
624       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
625                                                     sharedMem, &is_run_good, opargs));
626     } else if (data->dim == 2) {
627       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
628       CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
629 
630       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
631                                                     sharedMem, &is_run_good, opargs));
632     } else if (data->dim == 3) {
633       CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
634       CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
635 
636       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
637                                                     sharedMem, &is_run_good, opargs));
638     }
639     CeedCallHip(ceed, hipDeviceSynchronize());
640 
641     // Restore input arrays
642     for (CeedInt i = 0; i < num_input_fields; i++) {
643       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
644       if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
645       } else {
646         bool       is_active;
647         CeedVector vec;
648 
649         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
650         is_active = vec == CEED_VECTOR_ACTIVE;
651         if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
652         CeedCallBackend(CeedVectorDestroy(&vec));
653       }
654     }
655 
656     // Restore point coordinates
657     {
658       CeedVector vec;
659 
660       CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
661       CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
662       CeedCallBackend(CeedVectorDestroy(&vec));
663     }
664 
665     // Restore context data
666     CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
667 
668     // Restore assembly array
669     CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));
670 
671     // Cleanup
672     CeedCallBackend(CeedQFunctionDestroy(&qf));
673     if (!is_run_good) data->use_assembly_fallback = true;
674   }
675   CeedCallBackend(CeedDestroy(&ceed));
676 
677   // Fallback, if needed
678   if (data->use_assembly_fallback) {
679     CeedOperator op_fallback;
680 
681     CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints LinearAssembleAddDiagonal\n");
682     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
683     CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request));
684     return CEED_ERROR_SUCCESS;
685   }
686   return CEED_ERROR_SUCCESS;
687 }
688 
689 //------------------------------------------------------------------------------
690 // AtPoints full assembly
691 //------------------------------------------------------------------------------
static int CeedOperatorAssembleSingleAtPoints_Hip_gen(CeedOperator op, CeedInt offset, CeedVector assembled) {
  Ceed                  ceed;
  CeedOperator_Hip_gen *data;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorGetData(op, &data));

  // Build the assembly kernel
  // Full assembly is only generated when the numbers of active input and output bases match;
  // on mismatch or build failure the /gpu/hip/ref fallback at the bottom is used instead
  if (!data->assemble_full && !data->use_assembly_fallback) {
    bool                     is_build_good = false;
    CeedInt                  num_active_bases_in, num_active_bases_out;
    CeedOperatorAssemblyData assembly_data;

    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
    CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
                                                         NULL, NULL));
    if (num_active_bases_in == num_active_bases_out) {
      CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(op, &is_build_good));
    }
    if (!is_build_good) {
      CeedDebug(ceed, "Single Operator Assemble at Points compile failed, using fallback\n");
      data->use_assembly_fallback = true;
    }
  }

  // Try assembly
  if (!data->use_assembly_fallback) {
    bool                   is_run_good = true;
    CeedInt                num_elem, num_input_fields, num_output_fields;
    CeedEvalMode           eval_mode;
    CeedScalar            *assembled_array;
    CeedQFunctionField    *qf_input_fields, *qf_output_fields;
    CeedQFunction_Hip_gen *qf_data;
    CeedQFunction          qf;
    CeedOperatorField     *op_input_fields, *op_output_fields;

    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
    CeedDebug(ceed, "Running single operator assemble for /gpu/hip/gen\n");

    // Input vectors
    // Weight inputs have no backing vector and active inputs are not read during assembly,
    // so both get a NULL device pointer; only passive inputs are mapped to device arrays
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
        data->fields.inputs[i] = NULL;
      } else {
        bool       is_active;
        CeedVector vec;

        // Get input vector
        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
        is_active = vec == CEED_VECTOR_ACTIVE;
        if (is_active) data->fields.inputs[i] = NULL;
        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
        CeedCallBackend(CeedVectorDestroy(&vec));
      }
    }

    // Point coordinates
    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
      CeedCallBackend(CeedVectorDestroy(&vec));

      // Points per elem
      // Rebuild the cached device array of per-element point counts only when the
      // element count changes; otherwise the previously uploaded array is reused
      if (num_elem != data->points.num_elem) {
        CeedInt            *points_per_elem;
        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
        CeedElemRestriction rstr_points = NULL;

        data->points.num_elem = num_elem;
        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
        for (CeedInt e = 0; e < num_elem; e++) {
          CeedInt num_points_elem;

          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
          points_per_elem[e] = num_points_elem;
        }
        if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
        CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
        CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
        CeedCallBackend(CeedFree(&points_per_elem));
      }
    }

    // Get context data
    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

    // Assembly array
    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
    CeedScalar *assembled_offset_array = &assembled_array[offset];

    // Assemble full matrix block
    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields,          &data->B,
                      &data->G,          &data->W,      &data->points,  &assembled_offset_array};

    CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};

    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
    block_sizes[2] = 1;
    if (data->dim >= 1 && data->dim <= 3) {
      // Grid rounds up so every element is covered at block_sizes[2] elements per block
      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
      // Shared memory per element slice: thread_1d scalars in 1D, thread_1d^2 in 2D/3D
      CeedInt sharedMem = block_sizes[2] * data->thread_1d * (data->dim == 1 ? 1 : data->thread_1d) * sizeof(CeedScalar);

      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
                                                    &is_run_good, opargs));
    }
    CeedCallHip(ceed, hipDeviceSynchronize());

    // Restore input arrays
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
      } else {
        bool       is_active;
        CeedVector vec;

        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
        is_active = vec == CEED_VECTOR_ACTIVE;
        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
        CeedCallBackend(CeedVectorDestroy(&vec));
      }
    }

    // Restore point coordinates
    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
      CeedCallBackend(CeedVectorDestroy(&vec));
    }

    // Restore context data
    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));

    // Restore assembly array
    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));

    // Cleanup
    CeedCallBackend(CeedQFunctionDestroy(&qf));
    if (!is_run_good) {
      CeedDebug(ceed, "Single Operator Assemble at Points run failed, using fallback\n");
      data->use_assembly_fallback = true;
    }
  }
  CeedCallBackend(CeedDestroy(&ceed));

  // Fallback, if needed
  if (data->use_assembly_fallback) {
    CeedOperator op_fallback;

    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints SingleOperatorAssemble\n");
    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
    CeedCallBackend(CeedOperatorAssembleSingle(op_fallback, offset, assembled));
    return CEED_ERROR_SUCCESS;
  }
  return CEED_ERROR_SUCCESS;
}
873 
874 //------------------------------------------------------------------------------
875 // Create operator
876 //------------------------------------------------------------------------------
int CeedOperatorCreate_Hip_gen(CeedOperator op) {
  bool                  is_composite, is_at_points;
  Ceed                  ceed;
  CeedOperator_Hip_gen *impl;

  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedCalloc(1, &impl));
  CeedCallBackend(CeedOperatorSetData(op, impl));
  // Use CeedCallBackend consistently (this is backend code); previously CeedCall was mixed in
  CeedCallBackend(CeedOperatorIsComposite(op, &is_composite));
  if (is_composite) {
    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Hip_gen));
  } else {
    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen));
  }
  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
  if (is_at_points) {
    // AtPoints operators assemble via the generated AtPoints kernels
    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen));
    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Hip_gen));
  } else {
    // Standard operators assemble via QFunction assembly
    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip_gen));
    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen));
  }
  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen));
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
904 
905 //------------------------------------------------------------------------------
906