xref: /libCEED/backends/hip-ref/ceed-hip-ref-operator.c (revision 7d5185d77fd3980a37189da5411f3a999b5628b9)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <assert.h>
12 #include <stdbool.h>
13 #include <string.h>
14 #include <hip/hip_runtime.h>
15 
16 #include "../hip/ceed-hip-common.h"
17 #include "../hip/ceed-hip-compile.h"
18 #include "ceed-hip-ref.h"
19 
20 //------------------------------------------------------------------------------
21 // Destroy operator
22 //------------------------------------------------------------------------------
23 static int CeedOperatorDestroy_Hip(CeedOperator op) {
24   CeedOperator_Hip *impl;
25 
26   CeedCallBackend(CeedOperatorGetData(op, &impl));
27 
28   // Apply data
29   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
30     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
31   }
32   CeedCallBackend(CeedFree(&impl->e_vecs));
33   CeedCallBackend(CeedFree(&impl->input_states));
34 
35   for (CeedInt i = 0; i < impl->num_inputs; i++) {
36     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i]));
37   }
38   CeedCallBackend(CeedFree(&impl->q_vecs_in));
39 
40   for (CeedInt i = 0; i < impl->num_outputs; i++) {
41     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
42   }
43   CeedCallBackend(CeedFree(&impl->q_vecs_out));
44 
45   // QFunction assembly data
46   for (CeedInt i = 0; i < impl->num_active_in; i++) {
47     CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
48   }
49   CeedCallBackend(CeedFree(&impl->qf_active_in));
50 
51   // Diag data
52   if (impl->diag) {
53     Ceed ceed;
54 
55     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
56     if (impl->diag->module) {
57       CeedCallHip(ceed, hipModuleUnload(impl->diag->module));
58     }
59     if (impl->diag->module_point_block) {
60       CeedCallHip(ceed, hipModuleUnload(impl->diag->module_point_block));
61     }
62     CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_in));
63     CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_out));
64     CeedCallHip(ceed, hipFree(impl->diag->d_identity));
65     CeedCallHip(ceed, hipFree(impl->diag->d_interp_in));
66     CeedCallHip(ceed, hipFree(impl->diag->d_interp_out));
67     CeedCallHip(ceed, hipFree(impl->diag->d_grad_in));
68     CeedCallHip(ceed, hipFree(impl->diag->d_grad_out));
69     CeedCallHip(ceed, hipFree(impl->diag->d_div_in));
70     CeedCallHip(ceed, hipFree(impl->diag->d_div_out));
71     CeedCallHip(ceed, hipFree(impl->diag->d_curl_in));
72     CeedCallHip(ceed, hipFree(impl->diag->d_curl_out));
73     CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
74     CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
75     CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag));
76     CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag));
77   }
78   CeedCallBackend(CeedFree(&impl->diag));
79 
80   if (impl->asmb) {
81     Ceed ceed;
82 
83     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
84     CeedCallHip(ceed, hipModuleUnload(impl->asmb->module));
85     CeedCallHip(ceed, hipFree(impl->asmb->d_B_in));
86     CeedCallHip(ceed, hipFree(impl->asmb->d_B_out));
87   }
88   CeedCallBackend(CeedFree(&impl->asmb));
89 
90   CeedCallBackend(CeedFree(&impl));
91   return CEED_ERROR_SUCCESS;
92 }
93 
94 //------------------------------------------------------------------------------
95 // Setup infields or outfields
96 //------------------------------------------------------------------------------
97 static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e,
98                                        CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
99   Ceed                ceed;
100   CeedQFunctionField *qf_fields;
101   CeedOperatorField  *op_fields;
102 
103   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
104   if (is_input) {
105     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
106     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
107   } else {
108     CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
109     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
110   }
111 
112   // Loop over fields
113   for (CeedInt i = 0; i < num_fields; i++) {
114     bool         is_strided = false, skip_restriction = false;
115     CeedSize     q_size;
116     CeedInt      size;
117     CeedEvalMode eval_mode;
118     CeedBasis    basis;
119 
120     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
121     if (eval_mode != CEED_EVAL_WEIGHT) {
122       CeedElemRestriction elem_rstr;
123 
124       // Check whether this field can skip the element restriction:
125       // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
126       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
127 
128       // First, check whether the field is input or output:
129       if (is_input) {
130         CeedVector vec;
131 
132         // Check for passive input
133         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
134         if (vec != CEED_VECTOR_ACTIVE) {
135           // Check eval_mode
136           if (eval_mode == CEED_EVAL_NONE) {
137             // Check for strided restriction
138             CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
139             if (is_strided) {
140               // Check if vector is already in preferred backend ordering
141               CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction));
142             }
143           }
144         }
145       }
146       if (skip_restriction) {
147         // We do not need an E-Vector, but will use the input field vector's data directly in the operator application.
148         e_vecs[i + start_e] = NULL;
149       } else {
150         CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e]));
151       }
152     }
153 
154     switch (eval_mode) {
155       case CEED_EVAL_NONE:
156         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
157         q_size = (CeedSize)num_elem * Q * size;
158         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
159         break;
160       case CEED_EVAL_INTERP:
161       case CEED_EVAL_GRAD:
162       case CEED_EVAL_DIV:
163       case CEED_EVAL_CURL:
164         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
165         q_size = (CeedSize)num_elem * Q * size;
166         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
167         break;
168       case CEED_EVAL_WEIGHT:  // Only on input fields
169         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
170         q_size = (CeedSize)num_elem * Q;
171         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
172         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
173         break;
174     }
175   }
176   return CEED_ERROR_SUCCESS;
177 }
178 
179 //------------------------------------------------------------------------------
180 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
181 //------------------------------------------------------------------------------
182 static int CeedOperatorSetup_Hip(CeedOperator op) {
183   Ceed                ceed;
184   bool                is_setup_done;
185   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
186   CeedQFunctionField *qf_input_fields, *qf_output_fields;
187   CeedQFunction       qf;
188   CeedOperatorField  *op_input_fields, *op_output_fields;
189   CeedOperator_Hip   *impl;
190 
191   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
192   if (is_setup_done) return CEED_ERROR_SUCCESS;
193 
194   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
195   CeedCallBackend(CeedOperatorGetData(op, &impl));
196   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
197   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
198   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
199   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
200   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
201 
202   // Allocate
203   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
204   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
205   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
206   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
207   impl->num_inputs  = num_input_fields;
208   impl->num_outputs = num_output_fields;
209 
210   // Set up infield and outfield e_vecs and q_vecs
211   // Infields
212   CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
213   // Outfields
214   CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
215 
216   CeedCallBackend(CeedOperatorSetSetupDone(op));
217   return CEED_ERROR_SUCCESS;
218 }
219 
220 //------------------------------------------------------------------------------
221 // Setup Operator Inputs
222 //------------------------------------------------------------------------------
223 static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
224                                               CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
225                                               CeedOperator_Hip *impl, CeedRequest *request) {
226   for (CeedInt i = 0; i < num_input_fields; i++) {
227     CeedEvalMode        eval_mode;
228     CeedVector          vec;
229     CeedElemRestriction elem_rstr;
230 
231     // Get input vector
232     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
233     if (vec == CEED_VECTOR_ACTIVE) {
234       if (skip_active) continue;
235       else vec = in_vec;
236     }
237 
238     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
239     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
240     } else {
241       // Get input vector
242       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
243       // Get input element restriction
244       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
245       if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
246       // Restrict, if necessary
247       if (!impl->e_vecs[i]) {
248         // No restriction for this field; read data directly from vec.
249         CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
250       } else {
251         uint64_t state;
252 
253         CeedCallBackend(CeedVectorGetState(vec, &state));
254         if (state != impl->input_states[i]) {
255           CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
256           impl->input_states[i] = state;
257         }
258         // Get evec
259         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
260       }
261     }
262   }
263   return CEED_ERROR_SUCCESS;
264 }
265 
266 //------------------------------------------------------------------------------
267 // Input Basis Action
268 //------------------------------------------------------------------------------
269 static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
270                                              CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
271                                              CeedOperator_Hip *impl) {
272   for (CeedInt i = 0; i < num_input_fields; i++) {
273     CeedInt             elem_size, size;
274     CeedEvalMode        eval_mode;
275     CeedElemRestriction elem_rstr;
276     CeedBasis           basis;
277 
278     // Skip active input
279     if (skip_active) {
280       CeedVector vec;
281 
282       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
283       if (vec == CEED_VECTOR_ACTIVE) continue;
284     }
285     // Get elem_size, eval_mode, size
286     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
287     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
288     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
289     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
290     // Basis action
291     switch (eval_mode) {
292       case CEED_EVAL_NONE:
293         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
294         break;
295       case CEED_EVAL_INTERP:
296       case CEED_EVAL_GRAD:
297       case CEED_EVAL_DIV:
298       case CEED_EVAL_CURL:
299         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
300         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i]));
301         break;
302       case CEED_EVAL_WEIGHT:
303         break;  // No action
304     }
305   }
306   return CEED_ERROR_SUCCESS;
307 }
308 
309 //------------------------------------------------------------------------------
310 // Restore Input Vectors
311 //------------------------------------------------------------------------------
312 static inline int CeedOperatorRestoreInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
313                                                 const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) {
314   for (CeedInt i = 0; i < num_input_fields; i++) {
315     CeedEvalMode eval_mode;
316     CeedVector   vec;
317 
318     // Skip active input
319     if (skip_active) {
320       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
321       if (vec == CEED_VECTOR_ACTIVE) continue;
322     }
323     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
324     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
325     } else {
326       if (!impl->e_vecs[i]) {  // This was a skip_restriction case
327         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
328         CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i]));
329       } else {
330         CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i]));
331       }
332     }
333   }
334   return CEED_ERROR_SUCCESS;
335 }
336 
337 //------------------------------------------------------------------------------
338 // Apply and add to output
339 //------------------------------------------------------------------------------
340 static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
341   CeedInt             Q, num_elem, elem_size, num_input_fields, num_output_fields, size;
342   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
343   CeedQFunctionField *qf_input_fields, *qf_output_fields;
344   CeedQFunction       qf;
345   CeedOperatorField  *op_input_fields, *op_output_fields;
346   CeedOperator_Hip   *impl;
347 
348   CeedCallBackend(CeedOperatorGetData(op, &impl));
349   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
350   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
351   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
352   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
353   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
354 
355   // Setup
356   CeedCallBackend(CeedOperatorSetup_Hip(op));
357 
358   // Input Evecs and Restriction
359   CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
360 
361   // Input basis apply if needed
362   CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
363 
364   // Output pointers, as necessary
365   for (CeedInt i = 0; i < num_output_fields; i++) {
366     CeedEvalMode eval_mode;
367 
368     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
369     if (eval_mode == CEED_EVAL_NONE) {
370       // Set the output Q-Vector to use the E-Vector data directly.
371       CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
372       CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
373     }
374   }
375 
376   // Q function
377   CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
378 
379   // Output basis apply if needed
380   for (CeedInt i = 0; i < num_output_fields; i++) {
381     CeedEvalMode        eval_mode;
382     CeedElemRestriction elem_rstr;
383     CeedBasis           basis;
384 
385     // Get elem_size, eval_mode, size
386     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
387     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
388     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
389     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
390     // Basis action
391     switch (eval_mode) {
392       case CEED_EVAL_NONE:
393         break;  // No action
394       case CEED_EVAL_INTERP:
395       case CEED_EVAL_GRAD:
396       case CEED_EVAL_DIV:
397       case CEED_EVAL_CURL:
398         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
399         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
400         break;
401       // LCOV_EXCL_START
402       case CEED_EVAL_WEIGHT: {
403         return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
404         // LCOV_EXCL_STOP
405       }
406     }
407   }
408 
409   // Output restriction
410   for (CeedInt i = 0; i < num_output_fields; i++) {
411     CeedEvalMode        eval_mode;
412     CeedVector          vec;
413     CeedElemRestriction elem_rstr;
414 
415     // Restore evec
416     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
417     if (eval_mode == CEED_EVAL_NONE) {
418       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
419     }
420     // Get output vector
421     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
422     // Restrict
423     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
424     // Active
425     if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
426 
427     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
428   }
429 
430   // Restore input arrays
431   CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
432   return CEED_ERROR_SUCCESS;
433 }
434 
435 //------------------------------------------------------------------------------
436 // Linear QFunction Assembly Core
437 //------------------------------------------------------------------------------
438 static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
439                                                               CeedRequest *request) {
440   Ceed                ceed, ceed_parent;
441   CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
442   CeedScalar         *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL};
443   CeedVector         *active_inputs;
444   CeedQFunctionField *qf_input_fields, *qf_output_fields;
445   CeedQFunction       qf;
446   CeedOperatorField  *op_input_fields, *op_output_fields;
447   CeedOperator_Hip   *impl;
448 
449   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
450   CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent));
451   CeedCallBackend(CeedOperatorGetData(op, &impl));
452   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
453   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
454   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
455   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
456   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
457   active_inputs = impl->qf_active_in;
458   num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
459 
460   // Setup
461   CeedCallBackend(CeedOperatorSetup_Hip(op));
462 
463   // Input Evecs and Restriction
464   CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
465 
466   // Count number of active input fields
467   if (!num_active_in) {
468     for (CeedInt i = 0; i < num_input_fields; i++) {
469       CeedScalar *q_vec_array;
470       CeedVector  vec;
471 
472       // Get input vector
473       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
474       // Check if active input
475       if (vec == CEED_VECTOR_ACTIVE) {
476         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
477         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
478         CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array));
479         CeedCallBackend(CeedRealloc(num_active_in + size, &active_inputs));
480         for (CeedInt field = 0; field < size; field++) {
481           CeedSize q_size = (CeedSize)Q * num_elem;
482 
483           CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field]));
484           CeedCallBackend(
485               CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem]));
486         }
487         num_active_in += size;
488         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
489       }
490     }
491     impl->num_active_in = num_active_in;
492     impl->qf_active_in  = active_inputs;
493   }
494 
495   // Count number of active output fields
496   if (!num_active_out) {
497     for (CeedInt i = 0; i < num_output_fields; i++) {
498       CeedVector vec;
499 
500       // Get output vector
501       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
502       // Check if active output
503       if (vec == CEED_VECTOR_ACTIVE) {
504         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
505         num_active_out += size;
506       }
507     }
508     impl->num_active_out = num_active_out;
509   }
510 
511   // Check sizes
512   CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
513 
514   // Build objects if needed
515   if (build_objects) {
516     CeedSize l_size     = (CeedSize)num_elem * Q * num_active_in * num_active_out;
517     CeedInt  strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
518 
519     // Create output restriction
520     CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
521                                                      num_active_in * num_active_out * num_elem * Q, strides, rstr));
522     // Create assembled vector
523     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
524   }
525   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
526   CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array));
527 
528   // Input basis apply
529   CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
530 
531   // Assemble QFunction
532   for (CeedInt in = 0; in < num_active_in; in++) {
533     // Set Inputs
534     CeedCallBackend(CeedVectorSetValue(active_inputs[in], 1.0));
535     if (num_active_in > 1) {
536       CeedCallBackend(CeedVectorSetValue(active_inputs[(in + num_active_in - 1) % num_active_in], 0.0));
537     }
538     // Set Outputs
539     for (CeedInt out = 0; out < num_output_fields; out++) {
540       CeedVector vec;
541 
542       // Get output vector
543       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
544       // Check if active output
545       if (vec == CEED_VECTOR_ACTIVE) {
546         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
547         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
548         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
549       }
550     }
551     // Apply QFunction
552     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
553   }
554 
555   // Un-set output q_vecs to prevent accidental overwrite of Assembled
556   for (CeedInt out = 0; out < num_output_fields; out++) {
557     CeedVector vec;
558 
559     // Get output vector
560     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
561     // Check if active output
562     if (vec == CEED_VECTOR_ACTIVE) {
563       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
564     }
565   }
566 
567   // Restore input arrays
568   CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
569 
570   // Restore output
571   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
572   return CEED_ERROR_SUCCESS;
573 }
574 
575 //------------------------------------------------------------------------------
576 // Assemble Linear QFunction
577 //------------------------------------------------------------------------------
578 static int CeedOperatorLinearAssembleQFunction_Hip(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
579   return CeedOperatorLinearAssembleQFunctionCore_Hip(op, true, assembled, rstr, request);
580 }
581 
582 //------------------------------------------------------------------------------
583 // Update Assembled Linear QFunction
584 //------------------------------------------------------------------------------
585 static int CeedOperatorLinearAssembleQFunctionUpdate_Hip(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
586   return CeedOperatorLinearAssembleQFunctionCore_Hip(op, false, &assembled, &rstr, request);
587 }
588 
589 //------------------------------------------------------------------------------
590 // Assemble Diagonal Setup
591 //------------------------------------------------------------------------------
592 static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
593   Ceed                ceed;
594   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
595   CeedInt             q_comp, num_nodes, num_qpts;
596   CeedEvalMode       *eval_modes_in = NULL, *eval_modes_out = NULL;
597   CeedBasis           basis_in = NULL, basis_out = NULL;
598   CeedQFunctionField *qf_fields;
599   CeedQFunction       qf;
600   CeedOperatorField  *op_fields;
601   CeedOperator_Hip   *impl;
602 
603   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
604   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
605   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
606 
607   // Determine active input basis
608   CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
609   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
610   for (CeedInt i = 0; i < num_input_fields; i++) {
611     CeedVector vec;
612 
613     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
614     if (vec == CEED_VECTOR_ACTIVE) {
615       CeedBasis    basis;
616       CeedEvalMode eval_mode;
617 
618       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
619       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND,
620                 "Backend does not implement operator diagonal assembly with multiple active bases");
621       basis_in = basis;
622       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
623       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
624       if (eval_mode != CEED_EVAL_WEIGHT) {
625         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly
626         CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in));
627         for (CeedInt d = 0; d < q_comp; d++) eval_modes_in[num_eval_modes_in + d] = eval_mode;
628         num_eval_modes_in += q_comp;
629       }
630     }
631   }
632 
633   // Determine active output basis
634   CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
635   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
636   for (CeedInt i = 0; i < num_output_fields; i++) {
637     CeedVector vec;
638 
639     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
640     if (vec == CEED_VECTOR_ACTIVE) {
641       CeedBasis    basis;
642       CeedEvalMode eval_mode;
643 
644       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
645       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
646                 "Backend does not implement operator diagonal assembly with multiple active bases");
647       basis_out = basis;
648       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
649       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
650       if (eval_mode != CEED_EVAL_WEIGHT) {
651         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly
652         CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out));
653         for (CeedInt d = 0; d < q_comp; d++) eval_modes_out[num_eval_modes_out + d] = eval_mode;
654         num_eval_modes_out += q_comp;
655       }
656     }
657   }
658 
659   // Operator data struct
660   CeedCallBackend(CeedOperatorGetData(op, &impl));
661   CeedCallBackend(CeedCalloc(1, &impl->diag));
662   CeedOperatorDiag_Hip *diag = impl->diag;
663 
664   // Basis matrices
665   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
666   if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
667   else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
668   const CeedInt interp_bytes     = num_nodes * num_qpts * sizeof(CeedScalar);
669   const CeedInt eval_modes_bytes = sizeof(CeedEvalMode);
670   bool          has_eval_none    = false;
671 
672   // CEED_EVAL_NONE
673   for (CeedInt i = 0; i < num_eval_modes_in; i++) has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE);
674   for (CeedInt i = 0; i < num_eval_modes_out; i++) has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE);
675   if (has_eval_none) {
676     CeedScalar *identity = NULL;
677 
678     CeedCallBackend(CeedCalloc(num_nodes * num_qpts, &identity));
679     for (CeedInt i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0;
680     CeedCallHip(ceed, hipMalloc((void **)&diag->d_identity, interp_bytes));
681     CeedCallHip(ceed, hipMemcpy(diag->d_identity, identity, interp_bytes, hipMemcpyHostToDevice));
682     CeedCallBackend(CeedFree(&identity));
683   }
684 
685   // CEED_EVAL_INTERP, CEED_EVAL_GRAD, CEED_EVAL_DIV, and CEED_EVAL_CURL
686   for (CeedInt in = 0; in < 2; in++) {
687     CeedFESpace fespace;
688     CeedBasis   basis = in ? basis_in : basis_out;
689 
690     CeedCallBackend(CeedBasisGetFESpace(basis, &fespace));
691     switch (fespace) {
692       case CEED_FE_SPACE_H1: {
693         CeedInt           q_comp_interp, q_comp_grad;
694         const CeedScalar *interp, *grad;
695         CeedScalar       *d_interp, *d_grad;
696 
697         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
698         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
699 
700         CeedCallBackend(CeedBasisGetInterp(basis, &interp));
701         CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp));
702         CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice));
703         CeedCallBackend(CeedBasisGetGrad(basis, &grad));
704         CeedCallHip(ceed, hipMalloc((void **)&d_grad, interp_bytes * q_comp_grad));
705         CeedCallHip(ceed, hipMemcpy(d_grad, grad, interp_bytes * q_comp_grad, hipMemcpyHostToDevice));
706         if (in) {
707           diag->d_interp_in = d_interp;
708           diag->d_grad_in   = d_grad;
709         } else {
710           diag->d_interp_out = d_interp;
711           diag->d_grad_out   = d_grad;
712         }
713       } break;
714       case CEED_FE_SPACE_HDIV: {
715         CeedInt           q_comp_interp, q_comp_div;
716         const CeedScalar *interp, *div;
717         CeedScalar       *d_interp, *d_div;
718 
719         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
720         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
721 
722         CeedCallBackend(CeedBasisGetInterp(basis, &interp));
723         CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp));
724         CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice));
725         CeedCallBackend(CeedBasisGetDiv(basis, &div));
726         CeedCallHip(ceed, hipMalloc((void **)&d_div, interp_bytes * q_comp_div));
727         CeedCallHip(ceed, hipMemcpy(d_div, div, interp_bytes * q_comp_div, hipMemcpyHostToDevice));
728         if (in) {
729           diag->d_interp_in = d_interp;
730           diag->d_div_in    = d_div;
731         } else {
732           diag->d_interp_out = d_interp;
733           diag->d_div_out    = d_div;
734         }
735       } break;
736       case CEED_FE_SPACE_HCURL: {
737         CeedInt           q_comp_interp, q_comp_curl;
738         const CeedScalar *interp, *curl;
739         CeedScalar       *d_interp, *d_curl;
740 
741         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
742         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
743 
744         CeedCallBackend(CeedBasisGetInterp(basis, &interp));
745         CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp));
746         CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice));
747         CeedCallBackend(CeedBasisGetCurl(basis, &curl));
748         CeedCallHip(ceed, hipMalloc((void **)&d_curl, interp_bytes * q_comp_curl));
749         CeedCallHip(ceed, hipMemcpy(d_curl, curl, interp_bytes * q_comp_curl, hipMemcpyHostToDevice));
750         if (in) {
751           diag->d_interp_in = d_interp;
752           diag->d_curl_in   = d_curl;
753         } else {
754           diag->d_interp_out = d_interp;
755           diag->d_curl_out   = d_curl;
756         }
757       } break;
758     }
759   }
760 
761   // Arrays of eval_modes
762   CeedCallHip(ceed, hipMalloc((void **)&diag->d_eval_modes_in, num_eval_modes_in * eval_modes_bytes));
763   CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_in, eval_modes_in, num_eval_modes_in * eval_modes_bytes, hipMemcpyHostToDevice));
764   CeedCallHip(ceed, hipMalloc((void **)&diag->d_eval_modes_out, num_eval_modes_out * eval_modes_bytes));
765   CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, hipMemcpyHostToDevice));
766   CeedCallBackend(CeedFree(&eval_modes_in));
767   CeedCallBackend(CeedFree(&eval_modes_out));
768   return CEED_ERROR_SUCCESS;
769 }
770 
771 //------------------------------------------------------------------------------
772 // Assemble Diagonal Setup (Compilation)
773 //------------------------------------------------------------------------------
774 static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) {
775   Ceed                ceed;
776   char               *diagonal_kernel_source;
777   const char         *diagonal_kernel_path;
778   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
779   CeedInt             num_comp, q_comp, num_nodes, num_qpts;
780   CeedBasis           basis_in = NULL, basis_out = NULL;
781   CeedQFunctionField *qf_fields;
782   CeedQFunction       qf;
783   CeedOperatorField  *op_fields;
784   CeedOperator_Hip   *impl;
785 
786   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
787   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
788   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
789 
790   // Determine active input basis
791   CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
792   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
793   for (CeedInt i = 0; i < num_input_fields; i++) {
794     CeedVector vec;
795 
796     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
797     if (vec == CEED_VECTOR_ACTIVE) {
798       CeedEvalMode eval_mode;
799 
800       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in));
801       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
802       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
803       if (eval_mode != CEED_EVAL_WEIGHT) {
804         num_eval_modes_in += q_comp;
805       }
806     }
807   }
808 
809   // Determine active output basis
810   CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
811   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
812   for (CeedInt i = 0; i < num_output_fields; i++) {
813     CeedVector vec;
814 
815     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
816     if (vec == CEED_VECTOR_ACTIVE) {
817       CeedEvalMode eval_mode;
818 
819       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out));
820       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
821       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
822       if (eval_mode != CEED_EVAL_WEIGHT) {
823         num_eval_modes_out += q_comp;
824       }
825     }
826   }
827 
828   // Operator data struct
829   CeedCallBackend(CeedOperatorGetData(op, &impl));
830   CeedOperatorDiag_Hip *diag = impl->diag;
831 
832   // Assemble kernel
833   hipModule_t *module          = is_point_block ? &diag->module_point_block : &diag->module;
834   CeedInt      elems_per_block = 1;
835   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
836   CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
837   if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
838   else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
839   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path));
840   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n");
841   CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source));
842   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n");
843   CeedCallHip(ceed, CeedCompile_Hip(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
844                                     num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE",
845                                     use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block));
846   CeedCallHip(ceed, CeedGetKernel_Hip(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
847   CeedCallBackend(CeedFree(&diagonal_kernel_path));
848   CeedCallBackend(CeedFree(&diagonal_kernel_source));
849   return CEED_ERROR_SUCCESS;
850 }
851 
852 //------------------------------------------------------------------------------
853 // Assemble Diagonal Core
854 //------------------------------------------------------------------------------
855 static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) {
856   Ceed                ceed;
857   CeedInt             num_elem, num_nodes;
858   CeedScalar         *elem_diag_array;
859   const CeedScalar   *assembled_qf_array;
860   CeedVector          assembled_qf   = NULL, elem_diag;
861   CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out, diag_rstr;
862   CeedOperator_Hip   *impl;
863 
864   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
865   CeedCallBackend(CeedOperatorGetData(op, &impl));
866 
867   // Assemble QFunction
868   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, request));
869   CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr));
870   CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array));
871 
872   // Setup
873   if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op));
874   CeedOperatorDiag_Hip *diag = impl->diag;
875 
876   assert(diag != NULL);
877 
878   // Assemble kernel if needed
879   if ((!is_point_block && !diag->LinearDiagonal) || (is_point_block && !diag->LinearPointBlock)) {
880     CeedSize assembled_length, assembled_qf_length;
881     CeedInt  use_ceedsize_idx = 0;
882     CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length));
883     CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
884     if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
885 
886     CeedCallBackend(CeedOperatorAssembleDiagonalSetupCompile_Hip(op, use_ceedsize_idx, is_point_block));
887   }
888 
889   // Restriction and diagonal vector
890   CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out));
891   CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND,
892             "Cannot assemble operator diagonal with different input and output active element restrictions");
893   if (!is_point_block && !diag->diag_rstr) {
894     CeedCallBackend(CeedElemRestrictionCreateUnsignedCopy(rstr_out, &diag->diag_rstr));
895     CeedCallBackend(CeedElemRestrictionCreateVector(diag->diag_rstr, NULL, &diag->elem_diag));
896   } else if (is_point_block && !diag->point_block_diag_rstr) {
897     CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr));
898     CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag));
899   }
900   diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr;
901   elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag;
902   CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0));
903 
904   // Only assemble diagonal if the basis has nodes, otherwise inputs are null pointers
905   CeedCallBackend(CeedElemRestrictionGetElementSize(diag_rstr, &num_nodes));
906   if (num_nodes > 0) {
907     // Assemble element operator diagonals
908     CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array));
909     CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem));
910 
911     // Compute the diagonal of B^T D B
912     CeedInt elems_per_block = 1;
913     CeedInt grid            = CeedDivUpInt(num_elem, elems_per_block);
914     void   *args[]          = {(void *)&num_elem,      &diag->d_identity,       &diag->d_interp_in,  &diag->d_grad_in, &diag->d_div_in,
915                                &diag->d_curl_in,       &diag->d_interp_out,     &diag->d_grad_out,   &diag->d_div_out, &diag->d_curl_out,
916                                &diag->d_eval_modes_in, &diag->d_eval_modes_out, &assembled_qf_array, &elem_diag_array};
917 
918     if (is_point_block) {
919       CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args));
920     } else {
921       CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args));
922     }
923 
924     // Restore arrays
925     CeedCallBackend(CeedVectorRestoreArray(elem_diag, &elem_diag_array));
926     CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array));
927   }
928 
929   // Assemble local operator diagonal
930   CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request));
931 
932   // Cleanup
933   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
934   return CEED_ERROR_SUCCESS;
935 }
936 
937 //------------------------------------------------------------------------------
938 // Assemble Linear Diagonal
939 //------------------------------------------------------------------------------
940 static int CeedOperatorLinearAssembleAddDiagonal_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
941   CeedCallBackend(CeedOperatorAssembleDiagonalCore_Hip(op, assembled, request, false));
942   return CEED_ERROR_SUCCESS;
943 }
944 
945 //------------------------------------------------------------------------------
946 // Assemble Linear Point Block Diagonal
947 //------------------------------------------------------------------------------
948 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
949   CeedCallBackend(CeedOperatorAssembleDiagonalCore_Hip(op, assembled, request, true));
950   return CEED_ERROR_SUCCESS;
951 }
952 
953 //------------------------------------------------------------------------------
954 // Single Operator Assembly Setup
955 //------------------------------------------------------------------------------
956 static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) {
957   Ceed                ceed;
958   char               *assembly_kernel_source;
959   const char         *assembly_kernel_path;
960   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
961   CeedInt             elem_size_in, num_qpts_in = 0, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp;
962   CeedEvalMode       *eval_modes_in = NULL, *eval_modes_out = NULL;
963   CeedElemRestriction rstr_in = NULL, rstr_out = NULL;
964   CeedBasis           basis_in = NULL, basis_out = NULL;
965   CeedQFunctionField *qf_fields;
966   CeedQFunction       qf;
967   CeedOperatorField  *input_fields, *output_fields;
968   CeedOperator_Hip   *impl;
969 
970   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
971   CeedCallBackend(CeedOperatorGetData(op, &impl));
972 
973   // Get intput and output fields
974   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
975 
976   // Determine active input basis eval mode
977   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
978   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
979   for (CeedInt i = 0; i < num_input_fields; i++) {
980     CeedVector vec;
981 
982     CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec));
983     if (vec == CEED_VECTOR_ACTIVE) {
984       CeedBasis    basis;
985       CeedEvalMode eval_mode;
986 
987       CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
988       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
989       basis_in = basis;
990       CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in));
991       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in));
992       if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in;
993       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in));
994       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
995       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
996       if (eval_mode != CEED_EVAL_WEIGHT) {
997         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly
998         CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in));
999         for (CeedInt d = 0; d < q_comp; d++) {
1000           eval_modes_in[num_eval_modes_in + d] = eval_mode;
1001         }
1002         num_eval_modes_in += q_comp;
1003       }
1004     }
1005   }
1006 
1007   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
1008   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
1009   for (CeedInt i = 0; i < num_output_fields; i++) {
1010     CeedVector vec;
1011 
1012     CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec));
1013     if (vec == CEED_VECTOR_ACTIVE) {
1014       CeedBasis    basis;
1015       CeedEvalMode eval_mode;
1016 
1017       CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis));
1018       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
1019                 "Backend does not implement operator assembly with multiple active bases");
1020       basis_out = basis;
1021       CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out));
1022       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
1023       if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out;
1024       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out));
1025       CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED,
1026                 "Active input and output bases must have the same number of quadrature points");
1027       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1028       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
1029       if (eval_mode != CEED_EVAL_WEIGHT) {
1030         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly
1031         CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out));
1032         for (CeedInt d = 0; d < q_comp; d++) {
1033           eval_modes_out[num_eval_modes_out + d] = eval_mode;
1034         }
1035         num_eval_modes_out += q_comp;
1036       }
1037     }
1038   }
1039   CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
1040 
1041   CeedCallBackend(CeedCalloc(1, &impl->asmb));
1042   CeedOperatorAssemble_Hip *asmb = impl->asmb;
1043   asmb->elems_per_block          = 1;
1044   asmb->block_size_x             = elem_size_in;
1045   asmb->block_size_y             = elem_size_out;
1046 
1047   bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > 1024;
1048 
1049   if (fallback) {
1050     // Use fallback kernel with 1D threadblock
1051     asmb->block_size_y = 1;
1052   }
1053 
1054   // Compile kernels
1055   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in));
1056   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out));
1057   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble.h", &assembly_kernel_path));
1058   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n");
1059   CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source));
1060   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n");
1061   CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
1062                                   num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in,
1063                                   "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE",
1064                                   asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "USE_CEEDSIZE",
1065                                   use_ceedsize_idx));
1066   CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble));
1067   CeedCallBackend(CeedFree(&assembly_kernel_path));
1068   CeedCallBackend(CeedFree(&assembly_kernel_source));
1069 
1070   // Load into B_in, in order that they will be used in eval_modes_in
1071   {
1072     const CeedInt in_bytes           = elem_size_in * num_qpts_in * num_eval_modes_in * sizeof(CeedScalar);
1073     CeedInt       d_in               = 0;
1074     CeedEvalMode  eval_modes_in_prev = CEED_EVAL_NONE;
1075     bool          has_eval_none      = false;
1076     CeedScalar   *identity           = NULL;
1077 
1078     for (CeedInt i = 0; i < num_eval_modes_in; i++) {
1079       has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE);
1080     }
1081     if (has_eval_none) {
1082       CeedCallBackend(CeedCalloc(elem_size_in * num_qpts_in, &identity));
1083       for (CeedInt i = 0; i < (elem_size_in < num_qpts_in ? elem_size_in : num_qpts_in); i++) identity[i * elem_size_in + i] = 1.0;
1084     }
1085 
1086     CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_in, in_bytes));
1087     for (CeedInt i = 0; i < num_eval_modes_in; i++) {
1088       const CeedScalar *h_B_in;
1089 
1090       CeedCallBackend(CeedOperatorGetBasisPointer(basis_in, eval_modes_in[i], identity, &h_B_in));
1091       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_modes_in[i], &q_comp));
1092       if (q_comp > 1) {
1093         if (i == 0 || eval_modes_in[i] != eval_modes_in_prev) d_in = 0;
1094         else h_B_in = &h_B_in[(++d_in) * elem_size_in * num_qpts_in];
1095       }
1096       eval_modes_in_prev = eval_modes_in[i];
1097 
1098       CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar),
1099                                   hipMemcpyHostToDevice));
1100     }
1101 
1102     if (identity) {
1103       CeedCallBackend(CeedFree(&identity));
1104     }
1105   }
1106 
1107   // Load into B_out, in order that they will be used in eval_modes_out
1108   {
1109     const CeedInt out_bytes           = elem_size_out * num_qpts_out * num_eval_modes_out * sizeof(CeedScalar);
1110     CeedInt       d_out               = 0;
1111     CeedEvalMode  eval_modes_out_prev = CEED_EVAL_NONE;
1112     bool          has_eval_none       = false;
1113     CeedScalar   *identity            = NULL;
1114 
1115     for (CeedInt i = 0; i < num_eval_modes_out; i++) {
1116       has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE);
1117     }
1118     if (has_eval_none) {
1119       CeedCallBackend(CeedCalloc(elem_size_out * num_qpts_out, &identity));
1120       for (CeedInt i = 0; i < (elem_size_out < num_qpts_out ? elem_size_out : num_qpts_out); i++) identity[i * elem_size_out + i] = 1.0;
1121     }
1122 
1123     CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_out, out_bytes));
1124     for (CeedInt i = 0; i < num_eval_modes_out; i++) {
1125       const CeedScalar *h_B_out;
1126 
1127       CeedCallBackend(CeedOperatorGetBasisPointer(basis_out, eval_modes_out[i], identity, &h_B_out));
1128       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_modes_out[i], &q_comp));
1129       if (q_comp > 1) {
1130         if (i == 0 || eval_modes_out[i] != eval_modes_out_prev) d_out = 0;
1131         else h_B_out = &h_B_out[(++d_out) * elem_size_out * num_qpts_out];
1132       }
1133       eval_modes_out_prev = eval_modes_out[i];
1134 
1135       CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar),
1136                                   hipMemcpyHostToDevice));
1137     }
1138 
1139     if (identity) {
1140       CeedCallBackend(CeedFree(&identity));
1141     }
1142   }
1143   return CEED_ERROR_SUCCESS;
1144 }
1145 
1146 //------------------------------------------------------------------------------
1147 // Assemble matrix data for COO matrix of assembled operator.
1148 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
1149 //
1150 // Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval
1151 // modes).
1152 // TODO: allow multiple active input restrictions/basis objects
1153 //------------------------------------------------------------------------------
1154 static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedVector values) {
1155   Ceed                ceed;
1156   CeedSize            values_length = 0, assembled_qf_length = 0;
1157   CeedInt             use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out;
1158   CeedScalar         *values_array;
1159   const CeedScalar   *assembled_qf_array;
1160   CeedVector          assembled_qf   = NULL;
1161   CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out;
1162   CeedRestrictionType rstr_type_in, rstr_type_out;
1163   const bool         *orients_in = NULL, *orients_out = NULL;
1164   const CeedInt8     *curl_orients_in = NULL, *curl_orients_out = NULL;
1165   CeedOperator_Hip   *impl;
1166 
1167   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1168   CeedCallBackend(CeedOperatorGetData(op, &impl));
1169 
1170   // Assemble QFunction
1171   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, CEED_REQUEST_IMMEDIATE));
1172   CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr));
1173   CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array));
1174 
1175   CeedCallBackend(CeedVectorGetLength(values, &values_length));
1176   CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
1177   if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
1178 
1179   // Setup
1180   if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Hip(op, use_ceedsize_idx));
1181   CeedOperatorAssemble_Hip *asmb = impl->asmb;
1182 
1183   assert(asmb != NULL);
1184 
1185   // Assemble element operator
1186   CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array));
1187   values_array += offset;
1188 
1189   CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out));
1190   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem_in));
1191   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in));
1192 
1193   CeedCallBackend(CeedElemRestrictionGetType(rstr_in, &rstr_type_in));
1194   if (rstr_type_in == CEED_RESTRICTION_ORIENTED) {
1195     CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_in, CEED_MEM_DEVICE, &orients_in));
1196   } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) {
1197     CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_in, CEED_MEM_DEVICE, &curl_orients_in));
1198   }
1199 
1200   if (rstr_in != rstr_out) {
1201     CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out));
1202     CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED,
1203               "Active input and output operator restrictions must have the same number of elements");
1204     CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
1205 
1206     CeedCallBackend(CeedElemRestrictionGetType(rstr_out, &rstr_type_out));
1207     if (rstr_type_out == CEED_RESTRICTION_ORIENTED) {
1208       CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_out, CEED_MEM_DEVICE, &orients_out));
1209     } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) {
1210       CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_out, CEED_MEM_DEVICE, &curl_orients_out));
1211     }
1212   } else {
1213     elem_size_out    = elem_size_in;
1214     orients_out      = orients_in;
1215     curl_orients_out = curl_orients_in;
1216   }
1217 
1218   // Compute B^T D B
1219   CeedInt shared_mem =
1220       ((curl_orients_in || curl_orients_out ? elem_size_in * elem_size_out : 0) + (curl_orients_in ? elem_size_in * asmb->block_size_y : 0)) *
1221       sizeof(CeedScalar);
1222   CeedInt grid   = CeedDivUpInt(num_elem_in, asmb->elems_per_block);
1223   void   *args[] = {(void *)&num_elem_in, &asmb->d_B_in,     &asmb->d_B_out,      &orients_in,  &curl_orients_in,
1224                     &orients_out,         &curl_orients_out, &assembled_qf_array, &values_array};
1225 
1226   CeedCallBackend(
1227       CeedRunKernelDimShared_Hip(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args));
1228 
1229   // Restore arrays
1230   CeedCallBackend(CeedVectorRestoreArray(values, &values_array));
1231   CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array));
1232 
1233   // Cleanup
1234   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
1235   if (rstr_type_in == CEED_RESTRICTION_ORIENTED) {
1236     CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_in, &orients_in));
1237   } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) {
1238     CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_in, &curl_orients_in));
1239   }
1240   if (rstr_in != rstr_out) {
1241     if (rstr_type_out == CEED_RESTRICTION_ORIENTED) {
1242       CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_out, &orients_out));
1243     } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) {
1244       CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out));
1245     }
1246   }
1247   return CEED_ERROR_SUCCESS;
1248 }
1249 
1250 //------------------------------------------------------------------------------
1251 // Create operator
1252 //------------------------------------------------------------------------------
1253 int CeedOperatorCreate_Hip(CeedOperator op) {
1254   Ceed              ceed;
1255   CeedOperator_Hip *impl;
1256 
1257   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1258   CeedCallBackend(CeedCalloc(1, &impl));
1259   CeedCallBackend(CeedOperatorSetData(op, impl));
1260   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip));
1261   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip));
1262   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Hip));
1263   CeedCallBackend(
1264       CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip));
1265   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Hip));
1266   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip));
1267   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip));
1268   return CEED_ERROR_SUCCESS;
1269 }
1270 
1271 //------------------------------------------------------------------------------
1272