xref: /libCEED/backends/hip-ref/ceed-hip-ref-operator.c (revision 8a21357021cf95ebb58183dc35a646c90195a1f2)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <assert.h>
12 #include <stdbool.h>
13 #include <string.h>
14 #include <hip/hip_runtime.h>
15 
16 #include "../hip/ceed-hip-common.h"
17 #include "../hip/ceed-hip-compile.h"
18 #include "ceed-hip-ref.h"
19 
20 //------------------------------------------------------------------------------
21 // Destroy operator
22 //------------------------------------------------------------------------------
23 static int CeedOperatorDestroy_Hip(CeedOperator op) {
24   CeedOperator_Hip *impl;
25 
26   CeedCallBackend(CeedOperatorGetData(op, &impl));
27 
28   // Apply data
29   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
30   CeedCallBackend(CeedFree(&impl->skip_rstr_out));
31   CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
32   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
33     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
34   }
35   CeedCallBackend(CeedFree(&impl->e_vecs));
36   CeedCallBackend(CeedFree(&impl->input_states));
37 
38   for (CeedInt i = 0; i < impl->num_inputs; i++) {
39     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i]));
40   }
41   CeedCallBackend(CeedFree(&impl->q_vecs_in));
42 
43   for (CeedInt i = 0; i < impl->num_outputs; i++) {
44     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
45   }
46   CeedCallBackend(CeedFree(&impl->q_vecs_out));
47   CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem));
48 
49   // QFunction assembly data
50   for (CeedInt i = 0; i < impl->num_active_in; i++) {
51     CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
52   }
53   CeedCallBackend(CeedFree(&impl->qf_active_in));
54 
55   // Diag data
56   if (impl->diag) {
57     Ceed ceed;
58 
59     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
60     if (impl->diag->module) {
61       CeedCallHip(ceed, hipModuleUnload(impl->diag->module));
62     }
63     if (impl->diag->module_point_block) {
64       CeedCallHip(ceed, hipModuleUnload(impl->diag->module_point_block));
65     }
66     CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_in));
67     CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_out));
68     CeedCallHip(ceed, hipFree(impl->diag->d_identity));
69     CeedCallHip(ceed, hipFree(impl->diag->d_interp_in));
70     CeedCallHip(ceed, hipFree(impl->diag->d_interp_out));
71     CeedCallHip(ceed, hipFree(impl->diag->d_grad_in));
72     CeedCallHip(ceed, hipFree(impl->diag->d_grad_out));
73     CeedCallHip(ceed, hipFree(impl->diag->d_div_in));
74     CeedCallHip(ceed, hipFree(impl->diag->d_div_out));
75     CeedCallHip(ceed, hipFree(impl->diag->d_curl_in));
76     CeedCallHip(ceed, hipFree(impl->diag->d_curl_out));
77     CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
78     CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
79     CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag));
80     CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag));
81   }
82   CeedCallBackend(CeedFree(&impl->diag));
83 
84   if (impl->asmb) {
85     Ceed ceed;
86 
87     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
88     CeedCallHip(ceed, hipModuleUnload(impl->asmb->module));
89     CeedCallHip(ceed, hipFree(impl->asmb->d_B_in));
90     CeedCallHip(ceed, hipFree(impl->asmb->d_B_out));
91   }
92   CeedCallBackend(CeedFree(&impl->asmb));
93 
94   CeedCallBackend(CeedFree(&impl));
95   return CEED_ERROR_SUCCESS;
96 }
97 
98 //------------------------------------------------------------------------------
99 // Setup infields or outfields
100 //------------------------------------------------------------------------------
101 static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, bool *apply_add_basis,
102                                        CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
103   Ceed                ceed;
104   CeedQFunctionField *qf_fields;
105   CeedOperatorField  *op_fields;
106 
107   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
108   if (is_input) {
109     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
110     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
111   } else {
112     CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
113     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
114   }
115 
116   // Loop over fields
117   for (CeedInt i = 0; i < num_fields; i++) {
118     bool         is_strided = false, skip_restriction = false;
119     CeedSize     q_size;
120     CeedInt      size;
121     CeedEvalMode eval_mode;
122     CeedBasis    basis;
123 
124     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
125     if (eval_mode != CEED_EVAL_WEIGHT) {
126       CeedElemRestriction elem_rstr;
127 
128       // Check whether this field can skip the element restriction:
129       // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
130       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
131 
132       // First, check whether the field is input or output:
133       if (is_input) {
134         CeedVector vec;
135 
136         // Check for passive input
137         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
138         if (vec != CEED_VECTOR_ACTIVE) {
139           // Check eval_mode
140           if (eval_mode == CEED_EVAL_NONE) {
141             // Check for strided restriction
142             CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
143             if (is_strided) {
144               // Check if vector is already in preferred backend ordering
145               CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction));
146             }
147           }
148         }
149       }
150       if (skip_restriction) {
151         // We do not need an E-Vector, but will use the input field vector's data directly in the operator application.
152         e_vecs[i + start_e] = NULL;
153       } else {
154         CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e]));
155       }
156     }
157 
158     switch (eval_mode) {
159       case CEED_EVAL_NONE:
160         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
161         q_size = (CeedSize)num_elem * Q * size;
162         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
163         break;
164       case CEED_EVAL_INTERP:
165       case CEED_EVAL_GRAD:
166       case CEED_EVAL_DIV:
167       case CEED_EVAL_CURL:
168         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
169         q_size = (CeedSize)num_elem * Q * size;
170         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
171         break;
172       case CEED_EVAL_WEIGHT:  // Only on input fields
173         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
174         q_size = (CeedSize)num_elem * Q;
175         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
176         if (is_at_points) {
177           CeedInt num_points[num_elem];
178 
179           for (CeedInt i = 0; i < num_elem; i++) num_points[i] = Q;
180           CeedCallBackend(
181               CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i]));
182         } else {
183           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
184         }
185         break;
186     }
187   }
188   // Drop duplicate restrictions
189   if (is_input) {
190     for (CeedInt i = 0; i < num_fields; i++) {
191       CeedVector          vec_i;
192       CeedElemRestriction rstr_i;
193 
194       CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
195       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
196       for (CeedInt j = i + 1; j < num_fields; j++) {
197         CeedVector          vec_j;
198         CeedElemRestriction rstr_j;
199 
200         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
201         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
202         if (vec_i == vec_j && rstr_i == rstr_j) {
203           CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
204           skip_rstr[j] = true;
205         }
206       }
207     }
208   } else {
209     for (CeedInt i = num_fields - 1; i >= 0; i--) {
210       CeedVector          vec_i;
211       CeedElemRestriction rstr_i;
212 
213       CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
214       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
215       for (CeedInt j = i - 1; j >= 0; j--) {
216         CeedVector          vec_j;
217         CeedElemRestriction rstr_j;
218 
219         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
220         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
221         if (vec_i == vec_j && rstr_i == rstr_j) {
222           CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
223           skip_rstr[j]       = true;
224           apply_add_basis[i] = true;
225         }
226       }
227     }
228   }
229   return CEED_ERROR_SUCCESS;
230 }
231 
232 //------------------------------------------------------------------------------
233 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
234 //------------------------------------------------------------------------------
235 static int CeedOperatorSetup_Hip(CeedOperator op) {
236   Ceed                ceed;
237   bool                is_setup_done;
238   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
239   CeedQFunctionField *qf_input_fields, *qf_output_fields;
240   CeedQFunction       qf;
241   CeedOperatorField  *op_input_fields, *op_output_fields;
242   CeedOperator_Hip   *impl;
243 
244   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
245   if (is_setup_done) return CEED_ERROR_SUCCESS;
246 
247   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
248   CeedCallBackend(CeedOperatorGetData(op, &impl));
249   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
250   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
251   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
252   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
253   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
254 
255   // Allocate
256   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
257   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
258   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
259   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
260   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
261   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
262   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
263   impl->num_inputs  = num_input_fields;
264   impl->num_outputs = num_output_fields;
265 
266   // Set up infield and outfield e-vecs and q-vecs
267   // Infields
268   CeedCallBackend(
269       CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
270   // Outfields
271   CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
272                                               num_input_fields, num_output_fields, Q, num_elem));
273 
274   // Reuse active e-vecs where able
275   {
276     CeedInt              num_used  = 0;
277     CeedElemRestriction *rstr_used = NULL;
278 
279     for (CeedInt i = 0; i < num_input_fields; i++) {
280       bool                is_used = false;
281       CeedVector          vec_i;
282       CeedElemRestriction rstr_i;
283 
284       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
285       if (vec_i != CEED_VECTOR_ACTIVE) continue;
286       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
287       for (CeedInt j = 0; j < num_used; j++) {
288         if (rstr_i == rstr_used[i]) is_used = true;
289       }
290       if (is_used) continue;
291       num_used++;
292       if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
293       else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
294       rstr_used[num_used - 1] = rstr_i;
295       for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
296         CeedEvalMode        eval_mode;
297         CeedVector          vec_j;
298         CeedElemRestriction rstr_j;
299 
300         CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
301         if (vec_j != CEED_VECTOR_ACTIVE) continue;
302         CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
303         if (eval_mode == CEED_EVAL_NONE) continue;
304         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
305         if (rstr_i == rstr_j) {
306           CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
307         }
308       }
309     }
310     CeedCallBackend(CeedFree(&rstr_used));
311   }
312   impl->has_shared_e_vecs = true;
313   CeedCallBackend(CeedOperatorSetSetupDone(op));
314   return CEED_ERROR_SUCCESS;
315 }
316 
317 //------------------------------------------------------------------------------
318 // Setup Operator Inputs
319 //------------------------------------------------------------------------------
320 static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
321                                               CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
322                                               CeedOperator_Hip *impl, CeedRequest *request) {
323   for (CeedInt i = 0; i < num_input_fields; i++) {
324     CeedEvalMode        eval_mode;
325     CeedVector          vec;
326     CeedElemRestriction elem_rstr;
327 
328     // Get input vector
329     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
330     if (vec == CEED_VECTOR_ACTIVE) {
331       if (skip_active) continue;
332       else vec = in_vec;
333     }
334 
335     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
336     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
337     } else {
338       // Get input vector
339       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
340       // Get input element restriction
341       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
342       if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
343       // Restrict, if necessary
344       if (!impl->e_vecs[i]) {
345         // No restriction for this field; read data directly from vec.
346         CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
347       } else {
348         uint64_t state;
349 
350         CeedCallBackend(CeedVectorGetState(vec, &state));
351         if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
352           CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
353         }
354         impl->input_states[i] = state;
355         // Get evec
356         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
357       }
358     }
359   }
360   return CEED_ERROR_SUCCESS;
361 }
362 
363 //------------------------------------------------------------------------------
364 // Input Basis Action
365 //------------------------------------------------------------------------------
366 static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
367                                              CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
368                                              CeedOperator_Hip *impl) {
369   for (CeedInt i = 0; i < num_input_fields; i++) {
370     CeedInt             elem_size, size;
371     CeedEvalMode        eval_mode;
372     CeedElemRestriction elem_rstr;
373     CeedBasis           basis;
374 
375     // Skip active input
376     if (skip_active) {
377       CeedVector vec;
378 
379       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
380       if (vec == CEED_VECTOR_ACTIVE) continue;
381     }
382     // Get elem_size, eval_mode, size
383     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
384     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
385     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
386     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
387     // Basis action
388     switch (eval_mode) {
389       case CEED_EVAL_NONE:
390         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
391         break;
392       case CEED_EVAL_INTERP:
393       case CEED_EVAL_GRAD:
394       case CEED_EVAL_DIV:
395       case CEED_EVAL_CURL:
396         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
397         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i]));
398         break;
399       case CEED_EVAL_WEIGHT:
400         break;  // No action
401     }
402   }
403   return CEED_ERROR_SUCCESS;
404 }
405 
406 //------------------------------------------------------------------------------
407 // Restore Input Vectors
408 //------------------------------------------------------------------------------
409 static inline int CeedOperatorRestoreInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
410                                                 const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) {
411   for (CeedInt i = 0; i < num_input_fields; i++) {
412     CeedEvalMode eval_mode;
413     CeedVector   vec;
414 
415     // Skip active input
416     if (skip_active) {
417       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
418       if (vec == CEED_VECTOR_ACTIVE) continue;
419     }
420     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
421     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
422     } else {
423       if (!impl->e_vecs[i]) {  // This was a skip_restriction case
424         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
425         CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i]));
426       } else {
427         CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i]));
428       }
429     }
430   }
431   return CEED_ERROR_SUCCESS;
432 }
433 
434 //------------------------------------------------------------------------------
435 // Apply and add to output
436 //------------------------------------------------------------------------------
437 static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
438   CeedInt             Q, num_elem, elem_size, num_input_fields, num_output_fields, size;
439   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
440   CeedQFunctionField *qf_input_fields, *qf_output_fields;
441   CeedQFunction       qf;
442   CeedOperatorField  *op_input_fields, *op_output_fields;
443   CeedOperator_Hip   *impl;
444 
445   CeedCallBackend(CeedOperatorGetData(op, &impl));
446   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
447   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
448   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
449   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
450   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
451 
452   // Setup
453   CeedCallBackend(CeedOperatorSetup_Hip(op));
454 
455   // Input Evecs and Restriction
456   CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
457 
458   // Input basis apply if needed
459   CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
460 
461   // Output pointers, as necessary
462   for (CeedInt i = 0; i < num_output_fields; i++) {
463     CeedEvalMode eval_mode;
464 
465     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
466     if (eval_mode == CEED_EVAL_NONE) {
467       // Set the output Q-Vector to use the E-Vector data directly.
468       CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
469       CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
470     }
471   }
472 
473   // Q function
474   CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
475 
476   // Restore input arrays
477   CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
478 
479   // Output basis apply if needed
480   for (CeedInt i = 0; i < num_output_fields; i++) {
481     CeedEvalMode        eval_mode;
482     CeedElemRestriction elem_rstr;
483     CeedBasis           basis;
484 
485     // Get elem_size, eval_mode, size
486     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
487     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
488     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
489     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
490     // Basis action
491     switch (eval_mode) {
492       case CEED_EVAL_NONE:
493         break;  // No action
494       case CEED_EVAL_INTERP:
495       case CEED_EVAL_GRAD:
496       case CEED_EVAL_DIV:
497       case CEED_EVAL_CURL:
498         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
499         if (impl->apply_add_basis_out[i]) {
500           CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
501         } else {
502           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
503         }
504         break;
505       // LCOV_EXCL_START
506       case CEED_EVAL_WEIGHT: {
507         return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
508         // LCOV_EXCL_STOP
509       }
510     }
511   }
512 
513   // Output restriction
514   for (CeedInt i = 0; i < num_output_fields; i++) {
515     CeedEvalMode        eval_mode;
516     CeedVector          vec;
517     CeedElemRestriction elem_rstr;
518 
519     // Restore evec
520     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
521     if (eval_mode == CEED_EVAL_NONE) {
522       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
523     }
524     if (impl->skip_rstr_out[i]) continue;
525     // Get output vector
526     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
527     // Restrict
528     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
529     // Active
530     if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
531 
532     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
533   }
534   return CEED_ERROR_SUCCESS;
535 }
536 
537 //------------------------------------------------------------------------------
538 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
539 //------------------------------------------------------------------------------
540 static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
541   Ceed                ceed;
542   bool                is_setup_done;
543   CeedInt             max_num_points = -1, num_elem, num_input_fields, num_output_fields;
544   CeedQFunctionField *qf_input_fields, *qf_output_fields;
545   CeedQFunction       qf;
546   CeedOperatorField  *op_input_fields, *op_output_fields;
547   CeedOperator_Hip   *impl;
548 
549   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
550   if (is_setup_done) return CEED_ERROR_SUCCESS;
551 
552   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
553   CeedCallBackend(CeedOperatorGetData(op, &impl));
554   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
555   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
556   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
557   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
558   {
559     CeedElemRestriction elem_rstr = NULL;
560 
561     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL));
562     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points));
563   }
564   impl->max_num_points = max_num_points;
565 
566   // Allocate
567   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
568   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
569   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
570   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
571   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
572   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
573   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
574   impl->num_inputs  = num_input_fields;
575   impl->num_outputs = num_output_fields;
576 
577   // Set up infield and outfield e-vecs and q-vecs
578   // Infields
579   CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
580                                               max_num_points, num_elem));
581   // Outfields
582   CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
583                                               num_input_fields, num_output_fields, max_num_points, num_elem));
584 
585   // Reuse active e-vecs where able
586   {
587     CeedInt              num_used  = 0;
588     CeedElemRestriction *rstr_used = NULL;
589 
590     for (CeedInt i = 0; i < num_input_fields; i++) {
591       bool                is_used = false;
592       CeedVector          vec_i;
593       CeedElemRestriction rstr_i;
594 
595       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
596       if (vec_i != CEED_VECTOR_ACTIVE) continue;
597       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
598       for (CeedInt j = 0; j < num_used; j++) {
599         if (rstr_i == rstr_used[i]) is_used = true;
600       }
601       if (is_used) continue;
602       num_used++;
603       if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
604       else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
605       rstr_used[num_used - 1] = rstr_i;
606       for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
607         CeedEvalMode        eval_mode;
608         CeedVector          vec_j;
609         CeedElemRestriction rstr_j;
610 
611         CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
612         if (vec_j != CEED_VECTOR_ACTIVE) continue;
613         CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
614         if (eval_mode == CEED_EVAL_NONE) continue;
615         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
616         if (rstr_i == rstr_j) {
617           CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
618         }
619       }
620     }
621     CeedCallBackend(CeedFree(&rstr_used));
622   }
623   impl->has_shared_e_vecs = true;
624   CeedCallBackend(CeedOperatorSetSetupDone(op));
625   return CEED_ERROR_SUCCESS;
626 }
627 
628 //------------------------------------------------------------------------------
629 // Input Basis Action AtPoints
630 //------------------------------------------------------------------------------
631 static inline int CeedOperatorInputBasisAtPoints_Hip(CeedInt num_elem, const CeedInt *num_points, CeedQFunctionField *qf_input_fields,
632                                                      CeedOperatorField *op_input_fields, CeedInt num_input_fields, const bool skip_active,
633                                                      CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) {
634   for (CeedInt i = 0; i < num_input_fields; i++) {
635     CeedInt             elem_size, size;
636     CeedEvalMode        eval_mode;
637     CeedElemRestriction elem_rstr;
638     CeedBasis           basis;
639 
640     // Skip active input
641     if (skip_active) {
642       CeedVector vec;
643 
644       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
645       if (vec == CEED_VECTOR_ACTIVE) continue;
646     }
647     // Get elem_size, eval_mode, size
648     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
649     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
650     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
651     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
652     // Basis action
653     switch (eval_mode) {
654       case CEED_EVAL_NONE:
655         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
656         break;
657       case CEED_EVAL_INTERP:
658       case CEED_EVAL_GRAD:
659       case CEED_EVAL_DIV:
660       case CEED_EVAL_CURL:
661         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
662         CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
663                                                impl->q_vecs_in[i]));
664         break;
665       case CEED_EVAL_WEIGHT:
666         break;  // No action
667     }
668   }
669   return CEED_ERROR_SUCCESS;
670 }
671 
672 //------------------------------------------------------------------------------
673 // Apply and add to output AtPoints
674 //------------------------------------------------------------------------------
675 static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
676   CeedInt             max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
677   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
678   CeedQFunctionField *qf_input_fields, *qf_output_fields;
679   CeedQFunction       qf;
680   CeedOperatorField  *op_input_fields, *op_output_fields;
681   CeedOperator_Hip   *impl;
682 
683   CeedCallBackend(CeedOperatorGetData(op, &impl));
684   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
685   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
686   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
687   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
688   CeedInt num_points[num_elem];
689 
690   // Setup
691   CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op));
692   max_num_points = impl->max_num_points;
693   for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
694 
695   // Input Evecs and Restriction
696   CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
697 
698   // Get point coordinates
699   if (!impl->point_coords_elem) {
700     CeedVector          point_coords = NULL;
701     CeedElemRestriction rstr_points  = NULL;
702 
703     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
704     CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
705     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
706   }
707 
708   // Input basis apply if needed
709   CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
710 
711   // Output pointers, as necessary
712   for (CeedInt i = 0; i < num_output_fields; i++) {
713     CeedEvalMode eval_mode;
714 
715     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
716     if (eval_mode == CEED_EVAL_NONE) {
717       // Set the output Q-Vector to use the E-Vector data directly.
718       CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
719       CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
720     }
721   }
722 
723   // Q function
724   CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
725 
726   // Restore input arrays
727   CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
728 
729   // Output basis apply if needed
730   for (CeedInt i = 0; i < num_output_fields; i++) {
731     CeedEvalMode        eval_mode;
732     CeedElemRestriction elem_rstr;
733     CeedBasis           basis;
734 
735     // Get elem_size, eval_mode, size
736     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
737     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
738     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
739     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
740     // Basis action
741     switch (eval_mode) {
742       case CEED_EVAL_NONE:
743         break;  // No action
744       case CEED_EVAL_INTERP:
745       case CEED_EVAL_GRAD:
746       case CEED_EVAL_DIV:
747       case CEED_EVAL_CURL:
748         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
749         if (impl->apply_add_basis_out[i]) {
750           CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
751                                                     impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
752         } else {
753           CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
754                                                  impl->e_vecs[i + impl->num_inputs]));
755         }
756         break;
757       // LCOV_EXCL_START
758       case CEED_EVAL_WEIGHT: {
759         return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
760         // LCOV_EXCL_STOP
761       }
762     }
763   }
764 
765   // Output restriction
766   for (CeedInt i = 0; i < num_output_fields; i++) {
767     CeedEvalMode        eval_mode;
768     CeedVector          vec;
769     CeedElemRestriction elem_rstr;
770 
771     // Restore evec
772     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
773     if (eval_mode == CEED_EVAL_NONE) {
774       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
775     }
776     if (impl->skip_rstr_out[i]) continue;
777     // Get output vector
778     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
779     // Restrict
780     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
781     // Active
782     if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
783 
784     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
785   }
786   return CEED_ERROR_SUCCESS;
787 }
788 
789 //------------------------------------------------------------------------------
790 // Linear QFunction Assembly Core
791 //------------------------------------------------------------------------------
792 static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
793                                                               CeedRequest *request) {
794   Ceed                ceed, ceed_parent;
795   CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
796   CeedScalar         *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL};
797   CeedVector         *active_inputs;
798   CeedQFunctionField *qf_input_fields, *qf_output_fields;
799   CeedQFunction       qf;
800   CeedOperatorField  *op_input_fields, *op_output_fields;
801   CeedOperator_Hip   *impl;
802 
803   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
804   CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent));
805   CeedCallBackend(CeedOperatorGetData(op, &impl));
806   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
807   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
808   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
809   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
810   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
811   active_inputs = impl->qf_active_in;
812   num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
813 
814   // Setup
815   CeedCallBackend(CeedOperatorSetup_Hip(op));
816 
817   // Input Evecs and Restriction
818   CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
819 
820   // Count number of active input fields
821   if (!num_active_in) {
822     for (CeedInt i = 0; i < num_input_fields; i++) {
823       CeedScalar *q_vec_array;
824       CeedVector  vec;
825 
826       // Get input vector
827       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
828       // Check if active input
829       if (vec == CEED_VECTOR_ACTIVE) {
830         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
831         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
832         CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array));
833         CeedCallBackend(CeedRealloc(num_active_in + size, &active_inputs));
834         for (CeedInt field = 0; field < size; field++) {
835           CeedSize q_size = (CeedSize)Q * num_elem;
836 
837           CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field]));
838           CeedCallBackend(
839               CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem]));
840         }
841         num_active_in += size;
842         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
843       }
844     }
845     impl->num_active_in = num_active_in;
846     impl->qf_active_in  = active_inputs;
847   }
848 
849   // Count number of active output fields
850   if (!num_active_out) {
851     for (CeedInt i = 0; i < num_output_fields; i++) {
852       CeedVector vec;
853 
854       // Get output vector
855       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
856       // Check if active output
857       if (vec == CEED_VECTOR_ACTIVE) {
858         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
859         num_active_out += size;
860       }
861     }
862     impl->num_active_out = num_active_out;
863   }
864 
865   // Check sizes
866   CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
867 
868   // Build objects if needed
869   if (build_objects) {
870     CeedSize l_size     = (CeedSize)num_elem * Q * num_active_in * num_active_out;
871     CeedInt  strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
872 
873     // Create output restriction
874     CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
875                                                      (CeedSize)num_active_in * (CeedSize)num_active_out * (CeedSize)num_elem * (CeedSize)Q, strides,
876                                                      rstr));
877     // Create assembled vector
878     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
879   }
880   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
881   CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array));
882 
883   // Input basis apply
884   CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
885 
886   // Assemble QFunction
887   for (CeedInt in = 0; in < num_active_in; in++) {
888     // Set Inputs
889     CeedCallBackend(CeedVectorSetValue(active_inputs[in], 1.0));
890     if (num_active_in > 1) {
891       CeedCallBackend(CeedVectorSetValue(active_inputs[(in + num_active_in - 1) % num_active_in], 0.0));
892     }
893     // Set Outputs
894     for (CeedInt out = 0; out < num_output_fields; out++) {
895       CeedVector vec;
896 
897       // Get output vector
898       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
899       // Check if active output
900       if (vec == CEED_VECTOR_ACTIVE) {
901         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
902         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
903         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
904       }
905     }
906     // Apply QFunction
907     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
908   }
909 
910   // Un-set output q-vecs to prevent accidental overwrite of Assembled
911   for (CeedInt out = 0; out < num_output_fields; out++) {
912     CeedVector vec;
913 
914     // Get output vector
915     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
916     // Check if active output
917     if (vec == CEED_VECTOR_ACTIVE) {
918       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
919     }
920   }
921 
922   // Restore input arrays
923   CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
924 
925   // Restore output
926   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
927   return CEED_ERROR_SUCCESS;
928 }
929 
930 //------------------------------------------------------------------------------
931 // Assemble Linear QFunction
932 //------------------------------------------------------------------------------
933 static int CeedOperatorLinearAssembleQFunction_Hip(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
934   return CeedOperatorLinearAssembleQFunctionCore_Hip(op, true, assembled, rstr, request);
935 }
936 
937 //------------------------------------------------------------------------------
938 // Update Assembled Linear QFunction
939 //------------------------------------------------------------------------------
940 static int CeedOperatorLinearAssembleQFunctionUpdate_Hip(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
941   return CeedOperatorLinearAssembleQFunctionCore_Hip(op, false, &assembled, &rstr, request);
942 }
943 
944 //------------------------------------------------------------------------------
945 // Assemble Diagonal Setup
946 //------------------------------------------------------------------------------
947 static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
948   Ceed                ceed;
949   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
950   CeedInt             q_comp, num_nodes, num_qpts;
951   CeedEvalMode       *eval_modes_in = NULL, *eval_modes_out = NULL;
952   CeedBasis           basis_in = NULL, basis_out = NULL;
953   CeedQFunctionField *qf_fields;
954   CeedQFunction       qf;
955   CeedOperatorField  *op_fields;
956   CeedOperator_Hip   *impl;
957 
958   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
959   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
960   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
961 
962   // Determine active input basis
963   CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
964   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
965   for (CeedInt i = 0; i < num_input_fields; i++) {
966     CeedVector vec;
967 
968     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
969     if (vec == CEED_VECTOR_ACTIVE) {
970       CeedBasis    basis;
971       CeedEvalMode eval_mode;
972 
973       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
974       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND,
975                 "Backend does not implement operator diagonal assembly with multiple active bases");
976       basis_in = basis;
977       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
978       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
979       if (eval_mode != CEED_EVAL_WEIGHT) {
980         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly
981         CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in));
982         for (CeedInt d = 0; d < q_comp; d++) eval_modes_in[num_eval_modes_in + d] = eval_mode;
983         num_eval_modes_in += q_comp;
984       }
985     }
986   }
987 
988   // Determine active output basis
989   CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
990   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
991   for (CeedInt i = 0; i < num_output_fields; i++) {
992     CeedVector vec;
993 
994     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
995     if (vec == CEED_VECTOR_ACTIVE) {
996       CeedBasis    basis;
997       CeedEvalMode eval_mode;
998 
999       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
1000       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
1001                 "Backend does not implement operator diagonal assembly with multiple active bases");
1002       basis_out = basis;
1003       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1004       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
1005       if (eval_mode != CEED_EVAL_WEIGHT) {
1006         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly
1007         CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out));
1008         for (CeedInt d = 0; d < q_comp; d++) eval_modes_out[num_eval_modes_out + d] = eval_mode;
1009         num_eval_modes_out += q_comp;
1010       }
1011     }
1012   }
1013 
1014   // Operator data struct
1015   CeedCallBackend(CeedOperatorGetData(op, &impl));
1016   CeedCallBackend(CeedCalloc(1, &impl->diag));
1017   CeedOperatorDiag_Hip *diag = impl->diag;
1018 
1019   // Basis matrices
1020   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
1021   if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
1022   else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
1023   const CeedInt interp_bytes     = num_nodes * num_qpts * sizeof(CeedScalar);
1024   const CeedInt eval_modes_bytes = sizeof(CeedEvalMode);
1025   bool          has_eval_none    = false;
1026 
1027   // CEED_EVAL_NONE
1028   for (CeedInt i = 0; i < num_eval_modes_in; i++) has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE);
1029   for (CeedInt i = 0; i < num_eval_modes_out; i++) has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE);
1030   if (has_eval_none) {
1031     CeedScalar *identity = NULL;
1032 
1033     CeedCallBackend(CeedCalloc(num_nodes * num_qpts, &identity));
1034     for (CeedInt i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0;
1035     CeedCallHip(ceed, hipMalloc((void **)&diag->d_identity, interp_bytes));
1036     CeedCallHip(ceed, hipMemcpy(diag->d_identity, identity, interp_bytes, hipMemcpyHostToDevice));
1037     CeedCallBackend(CeedFree(&identity));
1038   }
1039 
1040   // CEED_EVAL_INTERP, CEED_EVAL_GRAD, CEED_EVAL_DIV, and CEED_EVAL_CURL
1041   for (CeedInt in = 0; in < 2; in++) {
1042     CeedFESpace fespace;
1043     CeedBasis   basis = in ? basis_in : basis_out;
1044 
1045     CeedCallBackend(CeedBasisGetFESpace(basis, &fespace));
1046     switch (fespace) {
1047       case CEED_FE_SPACE_H1: {
1048         CeedInt           q_comp_interp, q_comp_grad;
1049         const CeedScalar *interp, *grad;
1050         CeedScalar       *d_interp, *d_grad;
1051 
1052         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
1053         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
1054 
1055         CeedCallBackend(CeedBasisGetInterp(basis, &interp));
1056         CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp));
1057         CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice));
1058         CeedCallBackend(CeedBasisGetGrad(basis, &grad));
1059         CeedCallHip(ceed, hipMalloc((void **)&d_grad, interp_bytes * q_comp_grad));
1060         CeedCallHip(ceed, hipMemcpy(d_grad, grad, interp_bytes * q_comp_grad, hipMemcpyHostToDevice));
1061         if (in) {
1062           diag->d_interp_in = d_interp;
1063           diag->d_grad_in   = d_grad;
1064         } else {
1065           diag->d_interp_out = d_interp;
1066           diag->d_grad_out   = d_grad;
1067         }
1068       } break;
1069       case CEED_FE_SPACE_HDIV: {
1070         CeedInt           q_comp_interp, q_comp_div;
1071         const CeedScalar *interp, *div;
1072         CeedScalar       *d_interp, *d_div;
1073 
1074         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
1075         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
1076 
1077         CeedCallBackend(CeedBasisGetInterp(basis, &interp));
1078         CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp));
1079         CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice));
1080         CeedCallBackend(CeedBasisGetDiv(basis, &div));
1081         CeedCallHip(ceed, hipMalloc((void **)&d_div, interp_bytes * q_comp_div));
1082         CeedCallHip(ceed, hipMemcpy(d_div, div, interp_bytes * q_comp_div, hipMemcpyHostToDevice));
1083         if (in) {
1084           diag->d_interp_in = d_interp;
1085           diag->d_div_in    = d_div;
1086         } else {
1087           diag->d_interp_out = d_interp;
1088           diag->d_div_out    = d_div;
1089         }
1090       } break;
1091       case CEED_FE_SPACE_HCURL: {
1092         CeedInt           q_comp_interp, q_comp_curl;
1093         const CeedScalar *interp, *curl;
1094         CeedScalar       *d_interp, *d_curl;
1095 
1096         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
1097         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
1098 
1099         CeedCallBackend(CeedBasisGetInterp(basis, &interp));
1100         CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp));
1101         CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice));
1102         CeedCallBackend(CeedBasisGetCurl(basis, &curl));
1103         CeedCallHip(ceed, hipMalloc((void **)&d_curl, interp_bytes * q_comp_curl));
1104         CeedCallHip(ceed, hipMemcpy(d_curl, curl, interp_bytes * q_comp_curl, hipMemcpyHostToDevice));
1105         if (in) {
1106           diag->d_interp_in = d_interp;
1107           diag->d_curl_in   = d_curl;
1108         } else {
1109           diag->d_interp_out = d_interp;
1110           diag->d_curl_out   = d_curl;
1111         }
1112       } break;
1113     }
1114   }
1115 
1116   // Arrays of eval_modes
1117   CeedCallHip(ceed, hipMalloc((void **)&diag->d_eval_modes_in, num_eval_modes_in * eval_modes_bytes));
1118   CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_in, eval_modes_in, num_eval_modes_in * eval_modes_bytes, hipMemcpyHostToDevice));
1119   CeedCallHip(ceed, hipMalloc((void **)&diag->d_eval_modes_out, num_eval_modes_out * eval_modes_bytes));
1120   CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, hipMemcpyHostToDevice));
1121   CeedCallBackend(CeedFree(&eval_modes_in));
1122   CeedCallBackend(CeedFree(&eval_modes_out));
1123   return CEED_ERROR_SUCCESS;
1124 }
1125 
1126 //------------------------------------------------------------------------------
1127 // Assemble Diagonal Setup (Compilation)
1128 //------------------------------------------------------------------------------
1129 static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) {
1130   Ceed                ceed;
1131   char               *diagonal_kernel_source;
1132   const char         *diagonal_kernel_path;
1133   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
1134   CeedInt             num_comp, q_comp, num_nodes, num_qpts;
1135   CeedBasis           basis_in = NULL, basis_out = NULL;
1136   CeedQFunctionField *qf_fields;
1137   CeedQFunction       qf;
1138   CeedOperatorField  *op_fields;
1139   CeedOperator_Hip   *impl;
1140 
1141   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1142   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1143   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
1144 
1145   // Determine active input basis
1146   CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
1147   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
1148   for (CeedInt i = 0; i < num_input_fields; i++) {
1149     CeedVector vec;
1150 
1151     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
1152     if (vec == CEED_VECTOR_ACTIVE) {
1153       CeedEvalMode eval_mode;
1154 
1155       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in));
1156       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1157       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
1158       if (eval_mode != CEED_EVAL_WEIGHT) {
1159         num_eval_modes_in += q_comp;
1160       }
1161     }
1162   }
1163 
1164   // Determine active output basis
1165   CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
1166   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
1167   for (CeedInt i = 0; i < num_output_fields; i++) {
1168     CeedVector vec;
1169 
1170     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
1171     if (vec == CEED_VECTOR_ACTIVE) {
1172       CeedEvalMode eval_mode;
1173 
1174       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out));
1175       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1176       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
1177       if (eval_mode != CEED_EVAL_WEIGHT) {
1178         num_eval_modes_out += q_comp;
1179       }
1180     }
1181   }
1182 
1183   // Operator data struct
1184   CeedCallBackend(CeedOperatorGetData(op, &impl));
1185   CeedOperatorDiag_Hip *diag = impl->diag;
1186 
1187   // Assemble kernel
1188   hipModule_t *module          = is_point_block ? &diag->module_point_block : &diag->module;
1189   CeedInt      elems_per_block = 1;
1190   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
1191   CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
1192   if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
1193   else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
1194   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path));
1195   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n");
1196   CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source));
1197   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n");
1198   CeedCallHip(ceed, CeedCompile_Hip(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
1199                                     num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE",
1200                                     use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block));
1201   CeedCallHip(ceed, CeedGetKernel_Hip(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
1202   CeedCallBackend(CeedFree(&diagonal_kernel_path));
1203   CeedCallBackend(CeedFree(&diagonal_kernel_source));
1204   return CEED_ERROR_SUCCESS;
1205 }
1206 
1207 //------------------------------------------------------------------------------
1208 // Assemble Diagonal Core
1209 //------------------------------------------------------------------------------
1210 static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) {
1211   Ceed                ceed;
1212   CeedInt             num_elem, num_nodes;
1213   CeedScalar         *elem_diag_array;
1214   const CeedScalar   *assembled_qf_array;
1215   CeedVector          assembled_qf   = NULL, elem_diag;
1216   CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out, diag_rstr;
1217   CeedOperator_Hip   *impl;
1218 
1219   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1220   CeedCallBackend(CeedOperatorGetData(op, &impl));
1221 
1222   // Assemble QFunction
1223   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, request));
1224   CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr));
1225   CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array));
1226 
1227   // Setup
1228   if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op));
1229   CeedOperatorDiag_Hip *diag = impl->diag;
1230 
1231   assert(diag != NULL);
1232 
1233   // Assemble kernel if needed
1234   if ((!is_point_block && !diag->LinearDiagonal) || (is_point_block && !diag->LinearPointBlock)) {
1235     CeedSize assembled_length, assembled_qf_length;
1236     CeedInt  use_ceedsize_idx = 0;
1237     CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length));
1238     CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
1239     if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
1240 
1241     CeedCallBackend(CeedOperatorAssembleDiagonalSetupCompile_Hip(op, use_ceedsize_idx, is_point_block));
1242   }
1243 
1244   // Restriction and diagonal vector
1245   CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out));
1246   CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND,
1247             "Cannot assemble operator diagonal with different input and output active element restrictions");
1248   if (!is_point_block && !diag->diag_rstr) {
1249     CeedCallBackend(CeedElemRestrictionCreateUnsignedCopy(rstr_out, &diag->diag_rstr));
1250     CeedCallBackend(CeedElemRestrictionCreateVector(diag->diag_rstr, NULL, &diag->elem_diag));
1251   } else if (is_point_block && !diag->point_block_diag_rstr) {
1252     CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr));
1253     CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag));
1254   }
1255   diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr;
1256   elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag;
1257   CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0));
1258 
1259   // Only assemble diagonal if the basis has nodes, otherwise inputs are null pointers
1260   CeedCallBackend(CeedElemRestrictionGetElementSize(diag_rstr, &num_nodes));
1261   if (num_nodes > 0) {
1262     // Assemble element operator diagonals
1263     CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array));
1264     CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem));
1265 
1266     // Compute the diagonal of B^T D B
1267     CeedInt elems_per_block = 1;
1268     CeedInt grid            = CeedDivUpInt(num_elem, elems_per_block);
1269     void   *args[]          = {(void *)&num_elem,      &diag->d_identity,       &diag->d_interp_in,  &diag->d_grad_in, &diag->d_div_in,
1270                                &diag->d_curl_in,       &diag->d_interp_out,     &diag->d_grad_out,   &diag->d_div_out, &diag->d_curl_out,
1271                                &diag->d_eval_modes_in, &diag->d_eval_modes_out, &assembled_qf_array, &elem_diag_array};
1272 
1273     if (is_point_block) {
1274       CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args));
1275     } else {
1276       CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args));
1277     }
1278 
1279     // Restore arrays
1280     CeedCallBackend(CeedVectorRestoreArray(elem_diag, &elem_diag_array));
1281     CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array));
1282   }
1283 
1284   // Assemble local operator diagonal
1285   CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request));
1286 
1287   // Cleanup
1288   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
1289   return CEED_ERROR_SUCCESS;
1290 }
1291 
1292 //------------------------------------------------------------------------------
1293 // Assemble Linear Diagonal
1294 //------------------------------------------------------------------------------
1295 static int CeedOperatorLinearAssembleAddDiagonal_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
1296   CeedCallBackend(CeedOperatorAssembleDiagonalCore_Hip(op, assembled, request, false));
1297   return CEED_ERROR_SUCCESS;
1298 }
1299 
1300 //------------------------------------------------------------------------------
1301 // Assemble Linear Point Block Diagonal
1302 //------------------------------------------------------------------------------
1303 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
1304   CeedCallBackend(CeedOperatorAssembleDiagonalCore_Hip(op, assembled, request, true));
1305   return CEED_ERROR_SUCCESS;
1306 }
1307 
1308 //------------------------------------------------------------------------------
1309 // Single Operator Assembly Setup
1310 //------------------------------------------------------------------------------
1311 static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) {
1312   Ceed                ceed;
1313   char               *assembly_kernel_source;
1314   const char         *assembly_kernel_path;
1315   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
1316   CeedInt             elem_size_in, num_qpts_in = 0, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp;
1317   CeedEvalMode       *eval_modes_in = NULL, *eval_modes_out = NULL;
1318   CeedElemRestriction rstr_in = NULL, rstr_out = NULL;
1319   CeedBasis           basis_in = NULL, basis_out = NULL;
1320   CeedQFunctionField *qf_fields;
1321   CeedQFunction       qf;
1322   CeedOperatorField  *input_fields, *output_fields;
1323   CeedOperator_Hip   *impl;
1324 
1325   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1326   CeedCallBackend(CeedOperatorGetData(op, &impl));
1327 
1328   // Get intput and output fields
1329   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
1330 
1331   // Determine active input basis eval mode
1332   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1333   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
1334   for (CeedInt i = 0; i < num_input_fields; i++) {
1335     CeedVector vec;
1336 
1337     CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec));
1338     if (vec == CEED_VECTOR_ACTIVE) {
1339       CeedBasis    basis;
1340       CeedEvalMode eval_mode;
1341 
1342       CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
1343       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
1344       basis_in = basis;
1345       CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in));
1346       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in));
1347       if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in;
1348       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in));
1349       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1350       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
1351       if (eval_mode != CEED_EVAL_WEIGHT) {
1352         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly
1353         CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in));
1354         for (CeedInt d = 0; d < q_comp; d++) {
1355           eval_modes_in[num_eval_modes_in + d] = eval_mode;
1356         }
1357         num_eval_modes_in += q_comp;
1358       }
1359     }
1360   }
1361 
1362   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
1363   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
1364   for (CeedInt i = 0; i < num_output_fields; i++) {
1365     CeedVector vec;
1366 
1367     CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec));
1368     if (vec == CEED_VECTOR_ACTIVE) {
1369       CeedBasis    basis;
1370       CeedEvalMode eval_mode;
1371 
1372       CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis));
1373       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
1374                 "Backend does not implement operator assembly with multiple active bases");
1375       basis_out = basis;
1376       CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out));
1377       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
1378       if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out;
1379       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out));
1380       CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED,
1381                 "Active input and output bases must have the same number of quadrature points");
1382       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1383       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
1384       if (eval_mode != CEED_EVAL_WEIGHT) {
1385         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly
1386         CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out));
1387         for (CeedInt d = 0; d < q_comp; d++) {
1388           eval_modes_out[num_eval_modes_out + d] = eval_mode;
1389         }
1390         num_eval_modes_out += q_comp;
1391       }
1392     }
1393   }
1394   CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
1395 
1396   CeedCallBackend(CeedCalloc(1, &impl->asmb));
1397   CeedOperatorAssemble_Hip *asmb = impl->asmb;
1398   asmb->elems_per_block          = 1;
1399   asmb->block_size_x             = elem_size_in;
1400   asmb->block_size_y             = elem_size_out;
1401 
1402   bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > 1024;
1403 
1404   if (fallback) {
1405     // Use fallback kernel with 1D threadblock
1406     asmb->block_size_y = 1;
1407   }
1408 
1409   // Compile kernels
1410   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in));
1411   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out));
1412   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble.h", &assembly_kernel_path));
1413   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n");
1414   CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source));
1415   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n");
1416   CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
1417                                   num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in,
1418                                   "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE",
1419                                   asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "USE_CEEDSIZE",
1420                                   use_ceedsize_idx));
1421   CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble));
1422   CeedCallBackend(CeedFree(&assembly_kernel_path));
1423   CeedCallBackend(CeedFree(&assembly_kernel_source));
1424 
1425   // Load into B_in, in order that they will be used in eval_modes_in
1426   {
1427     const CeedInt in_bytes           = elem_size_in * num_qpts_in * num_eval_modes_in * sizeof(CeedScalar);
1428     CeedInt       d_in               = 0;
1429     CeedEvalMode  eval_modes_in_prev = CEED_EVAL_NONE;
1430     bool          has_eval_none      = false;
1431     CeedScalar   *identity           = NULL;
1432 
1433     for (CeedInt i = 0; i < num_eval_modes_in; i++) {
1434       has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE);
1435     }
1436     if (has_eval_none) {
1437       CeedCallBackend(CeedCalloc(elem_size_in * num_qpts_in, &identity));
1438       for (CeedInt i = 0; i < (elem_size_in < num_qpts_in ? elem_size_in : num_qpts_in); i++) identity[i * elem_size_in + i] = 1.0;
1439     }
1440 
1441     CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_in, in_bytes));
1442     for (CeedInt i = 0; i < num_eval_modes_in; i++) {
1443       const CeedScalar *h_B_in;
1444 
1445       CeedCallBackend(CeedOperatorGetBasisPointer(basis_in, eval_modes_in[i], identity, &h_B_in));
1446       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_modes_in[i], &q_comp));
1447       if (q_comp > 1) {
1448         if (i == 0 || eval_modes_in[i] != eval_modes_in_prev) d_in = 0;
1449         else h_B_in = &h_B_in[(++d_in) * elem_size_in * num_qpts_in];
1450       }
1451       eval_modes_in_prev = eval_modes_in[i];
1452 
1453       CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar),
1454                                   hipMemcpyHostToDevice));
1455     }
1456 
1457     if (identity) {
1458       CeedCallBackend(CeedFree(&identity));
1459     }
1460   }
1461 
1462   // Load into B_out, in order that they will be used in eval_modes_out
1463   {
1464     const CeedInt out_bytes           = elem_size_out * num_qpts_out * num_eval_modes_out * sizeof(CeedScalar);
1465     CeedInt       d_out               = 0;
1466     CeedEvalMode  eval_modes_out_prev = CEED_EVAL_NONE;
1467     bool          has_eval_none       = false;
1468     CeedScalar   *identity            = NULL;
1469 
1470     for (CeedInt i = 0; i < num_eval_modes_out; i++) {
1471       has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE);
1472     }
1473     if (has_eval_none) {
1474       CeedCallBackend(CeedCalloc(elem_size_out * num_qpts_out, &identity));
1475       for (CeedInt i = 0; i < (elem_size_out < num_qpts_out ? elem_size_out : num_qpts_out); i++) identity[i * elem_size_out + i] = 1.0;
1476     }
1477 
1478     CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_out, out_bytes));
1479     for (CeedInt i = 0; i < num_eval_modes_out; i++) {
1480       const CeedScalar *h_B_out;
1481 
1482       CeedCallBackend(CeedOperatorGetBasisPointer(basis_out, eval_modes_out[i], identity, &h_B_out));
1483       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_modes_out[i], &q_comp));
1484       if (q_comp > 1) {
1485         if (i == 0 || eval_modes_out[i] != eval_modes_out_prev) d_out = 0;
1486         else h_B_out = &h_B_out[(++d_out) * elem_size_out * num_qpts_out];
1487       }
1488       eval_modes_out_prev = eval_modes_out[i];
1489 
1490       CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar),
1491                                   hipMemcpyHostToDevice));
1492     }
1493 
1494     if (identity) {
1495       CeedCallBackend(CeedFree(&identity));
1496     }
1497   }
1498   return CEED_ERROR_SUCCESS;
1499 }
1500 
1501 //------------------------------------------------------------------------------
1502 // Assemble matrix data for COO matrix of assembled operator.
1503 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
1504 //
1505 // Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval
1506 // modes).
1507 // TODO: allow multiple active input restrictions/basis objects
1508 //------------------------------------------------------------------------------
1509 static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedVector values) {
1510   Ceed                ceed;
1511   CeedSize            values_length = 0, assembled_qf_length = 0;
1512   CeedInt             use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out;
1513   CeedScalar         *values_array;
1514   const CeedScalar   *assembled_qf_array;
1515   CeedVector          assembled_qf   = NULL;
1516   CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out;
1517   CeedRestrictionType rstr_type_in, rstr_type_out;
1518   const bool         *orients_in = NULL, *orients_out = NULL;
1519   const CeedInt8     *curl_orients_in = NULL, *curl_orients_out = NULL;
1520   CeedOperator_Hip   *impl;
1521 
1522   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1523   CeedCallBackend(CeedOperatorGetData(op, &impl));
1524 
1525   // Assemble QFunction
1526   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, CEED_REQUEST_IMMEDIATE));
1527   CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr));
1528   CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array));
1529 
1530   CeedCallBackend(CeedVectorGetLength(values, &values_length));
1531   CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
1532   if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
1533 
1534   // Setup
1535   if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Hip(op, use_ceedsize_idx));
1536   CeedOperatorAssemble_Hip *asmb = impl->asmb;
1537 
1538   assert(asmb != NULL);
1539 
1540   // Assemble element operator
1541   CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array));
1542   values_array += offset;
1543 
1544   CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out));
1545   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem_in));
1546   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in));
1547 
1548   CeedCallBackend(CeedElemRestrictionGetType(rstr_in, &rstr_type_in));
1549   if (rstr_type_in == CEED_RESTRICTION_ORIENTED) {
1550     CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_in, CEED_MEM_DEVICE, &orients_in));
1551   } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) {
1552     CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_in, CEED_MEM_DEVICE, &curl_orients_in));
1553   }
1554 
1555   if (rstr_in != rstr_out) {
1556     CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out));
1557     CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED,
1558               "Active input and output operator restrictions must have the same number of elements");
1559     CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
1560 
1561     CeedCallBackend(CeedElemRestrictionGetType(rstr_out, &rstr_type_out));
1562     if (rstr_type_out == CEED_RESTRICTION_ORIENTED) {
1563       CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_out, CEED_MEM_DEVICE, &orients_out));
1564     } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) {
1565       CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_out, CEED_MEM_DEVICE, &curl_orients_out));
1566     }
1567   } else {
1568     elem_size_out    = elem_size_in;
1569     orients_out      = orients_in;
1570     curl_orients_out = curl_orients_in;
1571   }
1572 
1573   // Compute B^T D B
1574   CeedInt shared_mem =
1575       ((curl_orients_in || curl_orients_out ? elem_size_in * elem_size_out : 0) + (curl_orients_in ? elem_size_in * asmb->block_size_y : 0)) *
1576       sizeof(CeedScalar);
1577   CeedInt grid   = CeedDivUpInt(num_elem_in, asmb->elems_per_block);
1578   void   *args[] = {(void *)&num_elem_in, &asmb->d_B_in,     &asmb->d_B_out,      &orients_in,  &curl_orients_in,
1579                     &orients_out,         &curl_orients_out, &assembled_qf_array, &values_array};
1580 
1581   CeedCallBackend(
1582       CeedRunKernelDimShared_Hip(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args));
1583 
1584   // Restore arrays
1585   CeedCallBackend(CeedVectorRestoreArray(values, &values_array));
1586   CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array));
1587 
1588   // Cleanup
1589   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
1590   if (rstr_type_in == CEED_RESTRICTION_ORIENTED) {
1591     CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_in, &orients_in));
1592   } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) {
1593     CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_in, &curl_orients_in));
1594   }
1595   if (rstr_in != rstr_out) {
1596     if (rstr_type_out == CEED_RESTRICTION_ORIENTED) {
1597       CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_out, &orients_out));
1598     } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) {
1599       CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out));
1600     }
1601   }
1602   return CEED_ERROR_SUCCESS;
1603 }
1604 
1605 //------------------------------------------------------------------------------
1606 // Assemble Linear QFunction AtPoints
1607 //------------------------------------------------------------------------------
1608 static int CeedOperatorLinearAssembleQFunctionAtPoints_Hip(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
1609   return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedOperatorLinearAssembleQFunction");
1610 }
1611 
1612 //------------------------------------------------------------------------------
1613 // Assemble Linear Diagonal AtPoints
1614 //------------------------------------------------------------------------------
1615 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
1616   CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields;
1617   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
1618   CeedQFunctionField *qf_input_fields, *qf_output_fields;
1619   CeedQFunction       qf;
1620   CeedOperatorField  *op_input_fields, *op_output_fields;
1621   CeedOperator_Hip   *impl;
1622 
1623   CeedCallBackend(CeedOperatorGetData(op, &impl));
1624   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1625   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
1626   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1627   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1628   CeedInt num_points[num_elem];
1629 
1630   // Setup
1631   CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op));
1632   max_num_points = impl->max_num_points;
1633   for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
1634 
1635   // Create separate output e-vecs
1636   if (impl->has_shared_e_vecs) {
1637     for (CeedInt i = 0; i < impl->num_outputs; i++) {
1638       CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
1639       CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[impl->num_inputs + i]));
1640     }
1641     CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
1642                                                 num_input_fields, num_output_fields, max_num_points, num_elem));
1643   }
1644   impl->has_shared_e_vecs = false;
1645 
1646   // Input Evecs and Restriction
1647   CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
1648 
1649   // Get point coordinates
1650   if (!impl->point_coords_elem) {
1651     CeedVector          point_coords = NULL;
1652     CeedElemRestriction rstr_points  = NULL;
1653 
1654     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
1655     CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
1656     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
1657   }
1658 
1659   // Clear active input Qvecs
1660   for (CeedInt i = 0; i < num_input_fields; i++) {
1661     CeedVector vec;
1662 
1663     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
1664     if (vec != CEED_VECTOR_ACTIVE) continue;
1665     CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
1666   }
1667 
1668   // Input basis apply if needed
1669   CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
1670 
1671   // Output pointers, as necessary
1672   for (CeedInt i = 0; i < num_output_fields; i++) {
1673     CeedEvalMode eval_mode;
1674 
1675     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1676     if (eval_mode == CEED_EVAL_NONE) {
1677       // Set the output Q-Vector to use the E-Vector data directly.
1678       CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
1679       CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
1680     }
1681   }
1682 
1683   // Loop over active fields
1684   for (CeedInt i = 0; i < num_input_fields; i++) {
1685     bool                is_active_at_points = true;
1686     CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0;
1687     CeedRestrictionType rstr_type;
1688     CeedVector          vec;
1689     CeedElemRestriction elem_rstr;
1690 
1691     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
1692     // -- Skip non-active input
1693     if (vec != CEED_VECTOR_ACTIVE) continue;
1694 
1695     // -- Get active restriction type
1696     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1697     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
1698     is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
1699     if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1700     else elem_size = max_num_points;
1701     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
1702 
1703     e_vec_size = elem_size * num_comp_active;
1704     for (CeedInt s = 0; s < e_vec_size; s++) {
1705       bool         is_active_input = false;
1706       CeedEvalMode eval_mode;
1707       CeedVector   vec;
1708       CeedBasis    basis;
1709 
1710       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
1711       // Skip non-active input
1712       is_active_input = vec == CEED_VECTOR_ACTIVE;
1713       if (!is_active_input) continue;
1714 
1715       // Update unit vector
1716       if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
1717       else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s - 1, e_vec_size, 0.0));
1718       CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 1.0));
1719 
1720       // Basis action
1721       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1722       switch (eval_mode) {
1723         case CEED_EVAL_NONE:
1724           CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
1725           break;
1726         case CEED_EVAL_INTERP:
1727         case CEED_EVAL_GRAD:
1728         case CEED_EVAL_DIV:
1729         case CEED_EVAL_CURL:
1730           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1731           CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
1732                                                  impl->q_vecs_in[i]));
1733           break;
1734         case CEED_EVAL_WEIGHT:
1735           break;  // No action
1736       }
1737 
1738       // Q function
1739       CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
1740 
1741       // Output basis apply if needed
1742       for (CeedInt j = 0; j < num_output_fields; j++) {
1743         bool                is_active_output = false;
1744         CeedInt             elem_size        = 0;
1745         CeedRestrictionType rstr_type;
1746         CeedEvalMode        eval_mode;
1747         CeedVector          vec;
1748         CeedElemRestriction elem_rstr;
1749         CeedBasis           basis;
1750 
1751         CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
1752         // ---- Skip non-active output
1753         is_active_output = vec == CEED_VECTOR_ACTIVE;
1754         if (!is_active_output) continue;
1755 
1756         // ---- Check if elem size matches
1757         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
1758         CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
1759         if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue;
1760         if (rstr_type == CEED_RESTRICTION_POINTS) {
1761           CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &elem_size));
1762         } else {
1763           CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1764         }
1765         {
1766           CeedInt num_comp = 0;
1767 
1768           CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1769           if (e_vec_size != num_comp * elem_size) continue;
1770         }
1771 
1772         // Basis action
1773         CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
1774         switch (eval_mode) {
1775           case CEED_EVAL_NONE:
1776             CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[j + impl->num_inputs], &e_data[j + num_input_fields]));
1777             break;
1778           case CEED_EVAL_INTERP:
1779           case CEED_EVAL_GRAD:
1780           case CEED_EVAL_DIV:
1781           case CEED_EVAL_CURL:
1782             CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
1783             CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
1784                                                    impl->q_vecs_out[j], impl->e_vecs[j + impl->num_inputs]));
1785             break;
1786           // LCOV_EXCL_START
1787           case CEED_EVAL_WEIGHT: {
1788             return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1789             // LCOV_EXCL_STOP
1790           }
1791         }
1792 
1793         // Mask output e-vec
1794         CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs[j + impl->num_inputs], impl->e_vecs[i], impl->e_vecs[j + impl->num_inputs]));
1795 
1796         // Restrict
1797         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
1798         CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[j + impl->num_inputs], assembled, request));
1799 
1800         // Reset q_vec for
1801         if (eval_mode == CEED_EVAL_NONE) {
1802           CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[j + impl->num_inputs], CEED_MEM_DEVICE, &e_data[j + num_input_fields]));
1803           CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[j], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[j + num_input_fields]));
1804         }
1805       }
1806 
1807       // Reset vec
1808       if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
1809     }
1810   }
1811 
1812   // Restore CEED_EVAL_NONE
1813   for (CeedInt i = 0; i < num_output_fields; i++) {
1814     CeedEvalMode eval_mode;
1815 
1816     // Get eval_mode
1817     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1818 
1819     // Restore evec
1820     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1821     if (eval_mode == CEED_EVAL_NONE) {
1822       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
1823     }
1824   }
1825 
1826   // Restore input arrays
1827   CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
1828   return CEED_ERROR_SUCCESS;
1829 }
1830 
1831 //------------------------------------------------------------------------------
1832 // Create operator
1833 //------------------------------------------------------------------------------
1834 int CeedOperatorCreate_Hip(CeedOperator op) {
1835   Ceed              ceed;
1836   CeedOperator_Hip *impl;
1837 
1838   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1839   CeedCallBackend(CeedCalloc(1, &impl));
1840   CeedCallBackend(CeedOperatorSetData(op, impl));
1841   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip));
1842   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip));
1843   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Hip));
1844   CeedCallBackend(
1845       CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip));
1846   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Hip));
1847   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip));
1848   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip));
1849   return CEED_ERROR_SUCCESS;
1850 }
1851 
1852 //------------------------------------------------------------------------------
1853 // Create operator AtPoints
1854 //------------------------------------------------------------------------------
1855 int CeedOperatorCreateAtPoints_Hip(CeedOperator op) {
1856   Ceed              ceed;
1857   CeedOperator_Hip *impl;
1858 
1859   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1860   CeedCallBackend(CeedCalloc(1, &impl));
1861   CeedCallBackend(CeedOperatorSetData(op, impl));
1862 
1863   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Hip));
1864   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip));
1865   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Hip));
1866   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip));
1867   return CEED_ERROR_SUCCESS;
1868 }
1869 
1870 //------------------------------------------------------------------------------
1871