xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-operator.c (revision d06a2c125c427f0694fb06710c5ee86c8219d1b1)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include <stdbool.h>
15 #include <string.h>
16 
17 #include "../cuda/ceed-cuda-common.h"
18 #include "../cuda/ceed-cuda-compile.h"
19 #include "ceed-cuda-ref.h"
20 
21 //------------------------------------------------------------------------------
22 // Destroy operator
23 //------------------------------------------------------------------------------
24 static int CeedOperatorDestroy_Cuda(CeedOperator op) {
25   CeedOperator_Cuda *impl;
26 
27   CeedCallBackend(CeedOperatorGetData(op, &impl));
28 
29   // Apply data
30   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
31   CeedCallBackend(CeedFree(&impl->skip_rstr_out));
32   CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
33   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
34     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
35   }
36   CeedCallBackend(CeedFree(&impl->e_vecs));
37   CeedCallBackend(CeedFree(&impl->input_states));
38 
39   for (CeedInt i = 0; i < impl->num_inputs; i++) {
40     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i]));
41   }
42   CeedCallBackend(CeedFree(&impl->q_vecs_in));
43 
44   for (CeedInt i = 0; i < impl->num_outputs; i++) {
45     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
46   }
47   CeedCallBackend(CeedFree(&impl->q_vecs_out));
48   CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem));
49 
50   // QFunction assembly data
51   for (CeedInt i = 0; i < impl->num_active_in; i++) {
52     CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
53   }
54   CeedCallBackend(CeedFree(&impl->qf_active_in));
55 
56   // Diag data
57   if (impl->diag) {
58     Ceed ceed;
59 
60     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
61     if (impl->diag->module) {
62       CeedCallCuda(ceed, cuModuleUnload(impl->diag->module));
63     }
64     if (impl->diag->module_point_block) {
65       CeedCallCuda(ceed, cuModuleUnload(impl->diag->module_point_block));
66     }
67     CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_in));
68     CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_out));
69     CeedCallCuda(ceed, cudaFree(impl->diag->d_identity));
70     CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_in));
71     CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_out));
72     CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_in));
73     CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_out));
74     CeedCallCuda(ceed, cudaFree(impl->diag->d_div_in));
75     CeedCallCuda(ceed, cudaFree(impl->diag->d_div_out));
76     CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_in));
77     CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_out));
78     CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
79     CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
80     CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag));
81     CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag));
82   }
83   CeedCallBackend(CeedFree(&impl->diag));
84 
85   if (impl->asmb) {
86     Ceed ceed;
87 
88     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
89     CeedCallCuda(ceed, cuModuleUnload(impl->asmb->module));
90     CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_in));
91     CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_out));
92   }
93   CeedCallBackend(CeedFree(&impl->asmb));
94 
95   CeedCallBackend(CeedFree(&impl));
96   return CEED_ERROR_SUCCESS;
97 }
98 
99 //------------------------------------------------------------------------------
100 // Setup infields or outfields
101 //------------------------------------------------------------------------------
102 static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, bool *apply_add_basis,
103                                         CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
104   Ceed                ceed;
105   CeedQFunctionField *qf_fields;
106   CeedOperatorField  *op_fields;
107 
108   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
109   if (is_input) {
110     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
111     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
112   } else {
113     CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
114     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
115   }
116 
117   // Loop over fields
118   for (CeedInt i = 0; i < num_fields; i++) {
119     bool         is_strided = false, skip_restriction = false;
120     CeedSize     q_size;
121     CeedInt      size;
122     CeedEvalMode eval_mode;
123     CeedBasis    basis;
124 
125     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
126     if (eval_mode != CEED_EVAL_WEIGHT) {
127       CeedElemRestriction elem_rstr;
128 
129       // Check whether this field can skip the element restriction:
130       // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
131       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
132 
133       // First, check whether the field is input or output:
134       if (is_input) {
135         CeedVector vec;
136 
137         // Check for passive input
138         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
139         if (vec != CEED_VECTOR_ACTIVE) {
140           // Check eval_mode
141           if (eval_mode == CEED_EVAL_NONE) {
142             // Check for strided restriction
143             CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
144             if (is_strided) {
145               // Check if vector is already in preferred backend ordering
146               CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction));
147             }
148           }
149         }
150       }
151       if (skip_restriction) {
152         // We do not need an E-Vector, but will use the input field vector's data directly in the operator application.
153         e_vecs[i + start_e] = NULL;
154       } else {
155         CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e]));
156       }
157     }
158 
159     switch (eval_mode) {
160       case CEED_EVAL_NONE:
161         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
162         q_size = (CeedSize)num_elem * Q * size;
163         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
164         break;
165       case CEED_EVAL_INTERP:
166       case CEED_EVAL_GRAD:
167       case CEED_EVAL_DIV:
168       case CEED_EVAL_CURL:
169         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
170         q_size = (CeedSize)num_elem * Q * size;
171         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
172         break;
173       case CEED_EVAL_WEIGHT:  // Only on input fields
174         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
175         q_size = (CeedSize)num_elem * Q;
176         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
177         if (is_at_points) {
178           CeedInt num_points[num_elem];
179 
180           for (CeedInt i = 0; i < num_elem; i++) num_points[i] = Q;
181           CeedCallBackend(
182               CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i]));
183         } else {
184           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
185         }
186         break;
187     }
188   }
189   // Drop duplicate restrictions
190   if (is_input) {
191     for (CeedInt i = 0; i < num_fields; i++) {
192       CeedVector          vec_i;
193       CeedElemRestriction rstr_i;
194 
195       CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
196       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
197       for (CeedInt j = i + 1; j < num_fields; j++) {
198         CeedVector          vec_j;
199         CeedElemRestriction rstr_j;
200 
201         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
202         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
203         if (vec_i == vec_j && rstr_i == rstr_j) {
204           CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
205           skip_rstr[j] = true;
206         }
207       }
208     }
209   } else {
210     for (CeedInt i = num_fields - 1; i >= 0; i--) {
211       CeedVector          vec_i;
212       CeedElemRestriction rstr_i;
213 
214       CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
215       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
216       for (CeedInt j = i - 1; j >= 0; j--) {
217         CeedVector          vec_j;
218         CeedElemRestriction rstr_j;
219 
220         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
221         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
222         if (vec_i == vec_j && rstr_i == rstr_j) {
223           CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
224           skip_rstr[j]       = true;
225           apply_add_basis[i] = true;
226         }
227       }
228     }
229   }
230   return CEED_ERROR_SUCCESS;
231 }
232 
233 //------------------------------------------------------------------------------
234 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
235 //------------------------------------------------------------------------------
236 static int CeedOperatorSetup_Cuda(CeedOperator op) {
237   Ceed                ceed;
238   bool                is_setup_done;
239   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
240   CeedQFunctionField *qf_input_fields, *qf_output_fields;
241   CeedQFunction       qf;
242   CeedOperatorField  *op_input_fields, *op_output_fields;
243   CeedOperator_Cuda  *impl;
244 
245   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
246   if (is_setup_done) return CEED_ERROR_SUCCESS;
247 
248   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
249   CeedCallBackend(CeedOperatorGetData(op, &impl));
250   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
251   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
252   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
253   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
254   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
255 
256   // Allocate
257   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
258   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
259   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
260   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
261   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
262   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
263   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
264   impl->num_inputs  = num_input_fields;
265   impl->num_outputs = num_output_fields;
266 
267   // Set up infield and outfield e_vecs and q_vecs
268   // Infields
269   CeedCallBackend(
270       CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
271   // Outfields
272   CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
273                                                num_input_fields, num_output_fields, Q, num_elem));
274 
275   CeedCallBackend(CeedOperatorSetSetupDone(op));
276   return CEED_ERROR_SUCCESS;
277 }
278 
279 //------------------------------------------------------------------------------
280 // Setup Operator Inputs
281 //------------------------------------------------------------------------------
282 static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
283                                                CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
284                                                CeedOperator_Cuda *impl, CeedRequest *request) {
285   for (CeedInt i = 0; i < num_input_fields; i++) {
286     CeedEvalMode        eval_mode;
287     CeedVector          vec;
288     CeedElemRestriction elem_rstr;
289 
290     // Get input vector
291     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
292     if (vec == CEED_VECTOR_ACTIVE) {
293       if (skip_active) continue;
294       else vec = in_vec;
295     }
296 
297     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
298     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
299     } else {
300       // Get input vector
301       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
302       // Get input element restriction
303       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
304       if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
305       // Restrict, if necessary
306       if (!impl->e_vecs[i]) {
307         // No restriction for this field; read data directly from vec.
308         CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
309       } else {
310         uint64_t state;
311 
312         CeedCallBackend(CeedVectorGetState(vec, &state));
313         if (state != impl->input_states[i] && !impl->skip_rstr_in[i]) {
314           CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
315         }
316         impl->input_states[i] = state;
317         // Get evec
318         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
319       }
320     }
321   }
322   return CEED_ERROR_SUCCESS;
323 }
324 
325 //------------------------------------------------------------------------------
326 // Input Basis Action
327 //------------------------------------------------------------------------------
328 static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
329                                               CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
330                                               CeedOperator_Cuda *impl) {
331   for (CeedInt i = 0; i < num_input_fields; i++) {
332     CeedInt             elem_size, size;
333     CeedEvalMode        eval_mode;
334     CeedElemRestriction elem_rstr;
335     CeedBasis           basis;
336 
337     // Skip active input
338     if (skip_active) {
339       CeedVector vec;
340 
341       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
342       if (vec == CEED_VECTOR_ACTIVE) continue;
343     }
344     // Get elem_size, eval_mode, size
345     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
346     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
347     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
348     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
349     // Basis action
350     switch (eval_mode) {
351       case CEED_EVAL_NONE:
352         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
353         break;
354       case CEED_EVAL_INTERP:
355       case CEED_EVAL_GRAD:
356       case CEED_EVAL_DIV:
357       case CEED_EVAL_CURL:
358         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
359         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i]));
360         break;
361       case CEED_EVAL_WEIGHT:
362         break;  // No action
363     }
364   }
365   return CEED_ERROR_SUCCESS;
366 }
367 
368 //------------------------------------------------------------------------------
369 // Restore Input Vectors
370 //------------------------------------------------------------------------------
371 static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
372                                                  const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) {
373   for (CeedInt i = 0; i < num_input_fields; i++) {
374     CeedEvalMode eval_mode;
375     CeedVector   vec;
376 
377     // Skip active input
378     if (skip_active) {
379       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
380       if (vec == CEED_VECTOR_ACTIVE) continue;
381     }
382     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
383     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
384     } else {
385       if (!impl->e_vecs[i]) {  // This was a skip_restriction case
386         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
387         CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i]));
388       } else {
389         CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i]));
390       }
391     }
392   }
393   return CEED_ERROR_SUCCESS;
394 }
395 
396 //------------------------------------------------------------------------------
397 // Apply and add to output
398 //------------------------------------------------------------------------------
399 static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
400   CeedInt             Q, num_elem, elem_size, num_input_fields, num_output_fields, size;
401   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
402   CeedQFunctionField *qf_input_fields, *qf_output_fields;
403   CeedQFunction       qf;
404   CeedOperatorField  *op_input_fields, *op_output_fields;
405   CeedOperator_Cuda  *impl;
406 
407   CeedCallBackend(CeedOperatorGetData(op, &impl));
408   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
409   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
410   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
411   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
412   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
413 
414   // Setup
415   CeedCallBackend(CeedOperatorSetup_Cuda(op));
416 
417   // Input Evecs and Restriction
418   CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
419 
420   // Input basis apply if needed
421   CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
422 
423   // Output pointers, as necessary
424   for (CeedInt i = 0; i < num_output_fields; i++) {
425     CeedEvalMode eval_mode;
426 
427     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
428     if (eval_mode == CEED_EVAL_NONE) {
429       // Set the output Q-Vector to use the E-Vector data directly.
430       CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
431       CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
432     }
433   }
434 
435   // Q function
436   CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
437 
438   // Output basis apply if needed
439   for (CeedInt i = 0; i < num_output_fields; i++) {
440     CeedEvalMode        eval_mode;
441     CeedElemRestriction elem_rstr;
442     CeedBasis           basis;
443 
444     // Get elem_size, eval_mode, size
445     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
446     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
447     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
448     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
449     // Basis action
450     switch (eval_mode) {
451       case CEED_EVAL_NONE:
452         break;  // No action
453       case CEED_EVAL_INTERP:
454       case CEED_EVAL_GRAD:
455       case CEED_EVAL_DIV:
456       case CEED_EVAL_CURL:
457         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
458         if (impl->apply_add_basis_out[i]) {
459           CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
460         } else {
461           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
462         }
463         break;
464       // LCOV_EXCL_START
465       case CEED_EVAL_WEIGHT: {
466         return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
467         // LCOV_EXCL_STOP
468       }
469     }
470   }
471 
472   // Output restriction
473   for (CeedInt i = 0; i < num_output_fields; i++) {
474     CeedEvalMode        eval_mode;
475     CeedVector          vec;
476     CeedElemRestriction elem_rstr;
477 
478     // Restore evec
479     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
480     if (eval_mode == CEED_EVAL_NONE) {
481       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
482     }
483     if (impl->skip_rstr_out[i]) continue;
484     // Get output vector
485     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
486     // Restrict
487     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
488     // Active
489     if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
490 
491     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
492   }
493 
494   // Restore input arrays
495   CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
496   return CEED_ERROR_SUCCESS;
497 }
498 
499 //------------------------------------------------------------------------------
500 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
501 //------------------------------------------------------------------------------
502 static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
503   Ceed                ceed;
504   bool                is_setup_done;
505   CeedInt             max_num_points = -1, num_elem, num_input_fields, num_output_fields;
506   CeedQFunctionField *qf_input_fields, *qf_output_fields;
507   CeedQFunction       qf;
508   CeedOperatorField  *op_input_fields, *op_output_fields;
509   CeedOperator_Cuda  *impl;
510 
511   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
512   if (is_setup_done) return CEED_ERROR_SUCCESS;
513 
514   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
515   CeedCallBackend(CeedOperatorGetData(op, &impl));
516   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
517   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
518   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
519   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
520   {
521     CeedElemRestriction elem_rstr = NULL;
522 
523     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL));
524     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points));
525   }
526   impl->max_num_points = max_num_points;
527 
528   // Allocate
529   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
530   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
531   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
532   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
533   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
534   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
535   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
536   impl->num_inputs  = num_input_fields;
537   impl->num_outputs = num_output_fields;
538 
539   // Set up infield and outfield e_vecs and q_vecs
540   // Infields
541   CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
542                                                max_num_points, num_elem));
543   // Outfields
544   CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
545                                                num_input_fields, num_output_fields, max_num_points, num_elem));
546 
547   CeedCallBackend(CeedOperatorSetSetupDone(op));
548   return CEED_ERROR_SUCCESS;
549 }
550 
551 //------------------------------------------------------------------------------
552 // Input Basis Action AtPoints
553 //------------------------------------------------------------------------------
554 static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedInt num_elem, const CeedInt *num_points, CeedQFunctionField *qf_input_fields,
555                                                       CeedOperatorField *op_input_fields, CeedInt num_input_fields, const bool skip_active,
556                                                       CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) {
557   for (CeedInt i = 0; i < num_input_fields; i++) {
558     CeedInt             elem_size, size;
559     CeedEvalMode        eval_mode;
560     CeedElemRestriction elem_rstr;
561     CeedBasis           basis;
562 
563     // Skip active input
564     if (skip_active) {
565       CeedVector vec;
566 
567       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
568       if (vec == CEED_VECTOR_ACTIVE) continue;
569     }
570     // Get elem_size, eval_mode, size
571     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
572     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
573     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
574     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
575     // Basis action
576     switch (eval_mode) {
577       case CEED_EVAL_NONE:
578         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
579         break;
580       case CEED_EVAL_INTERP:
581       case CEED_EVAL_GRAD:
582       case CEED_EVAL_DIV:
583       case CEED_EVAL_CURL:
584         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
585         CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
586                                                impl->q_vecs_in[i]));
587         break;
588       case CEED_EVAL_WEIGHT:
589         break;  // No action
590     }
591   }
592   return CEED_ERROR_SUCCESS;
593 }
594 
595 //------------------------------------------------------------------------------
596 // Apply and add to output AtPoints
597 //------------------------------------------------------------------------------
598 static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
599   CeedInt             max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
600   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
601   CeedQFunctionField *qf_input_fields, *qf_output_fields;
602   CeedQFunction       qf;
603   CeedOperatorField  *op_input_fields, *op_output_fields;
604   CeedOperator_Cuda  *impl;
605 
606   CeedCallBackend(CeedOperatorGetData(op, &impl));
607   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
608   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
609   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
610   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
611   CeedInt num_points[num_elem];
612 
613   // Setup
614   CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
615   max_num_points = impl->max_num_points;
616   for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
617 
618   // Input Evecs and Restriction
619   CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
620 
621   // Get point coordinates
622   if (!impl->point_coords_elem) {
623     CeedVector          point_coords = NULL;
624     CeedElemRestriction rstr_points  = NULL;
625 
626     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
627     CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
628     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
629   }
630 
631   // Input basis apply if needed
632   CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
633 
634   // Output pointers, as necessary
635   for (CeedInt i = 0; i < num_output_fields; i++) {
636     CeedEvalMode eval_mode;
637 
638     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
639     if (eval_mode == CEED_EVAL_NONE) {
640       // Set the output Q-Vector to use the E-Vector data directly.
641       CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
642       CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
643     }
644   }
645 
646   // Q function
647   CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
648 
649   // Output basis apply if needed
650   for (CeedInt i = 0; i < num_output_fields; i++) {
651     CeedEvalMode        eval_mode;
652     CeedElemRestriction elem_rstr;
653     CeedBasis           basis;
654 
655     // Get elem_size, eval_mode, size
656     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
657     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
658     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
659     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
660     // Basis action
661     switch (eval_mode) {
662       case CEED_EVAL_NONE:
663         break;  // No action
664       case CEED_EVAL_INTERP:
665       case CEED_EVAL_GRAD:
666       case CEED_EVAL_DIV:
667       case CEED_EVAL_CURL:
668         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
669         if (impl->apply_add_basis_out[i]) {
670           CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
671                                                     impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
672         } else {
673           CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
674                                                  impl->e_vecs[i + impl->num_inputs]));
675         }
676         break;
677       // LCOV_EXCL_START
678       case CEED_EVAL_WEIGHT: {
679         return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
680         // LCOV_EXCL_STOP
681       }
682     }
683   }
684 
685   // Output restriction
686   for (CeedInt i = 0; i < num_output_fields; i++) {
687     CeedEvalMode        eval_mode;
688     CeedVector          vec;
689     CeedElemRestriction elem_rstr;
690 
691     // Restore evec
692     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
693     if (eval_mode == CEED_EVAL_NONE) {
694       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
695     }
696     if (impl->skip_rstr_out[i]) continue;
697     // Get output vector
698     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
699     // Restrict
700     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
701     // Active
702     if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
703 
704     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
705   }
706 
707   // Restore input arrays
708   CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
709   return CEED_ERROR_SUCCESS;
710 }
711 
712 //------------------------------------------------------------------------------
713 // Linear QFunction Assembly Core
714 //------------------------------------------------------------------------------
715 static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
716                                                                CeedRequest *request) {
717   Ceed                ceed, ceed_parent;
718   CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
719   CeedScalar         *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL};
720   CeedVector         *active_inputs;
721   CeedQFunctionField *qf_input_fields, *qf_output_fields;
722   CeedQFunction       qf;
723   CeedOperatorField  *op_input_fields, *op_output_fields;
724   CeedOperator_Cuda  *impl;
725 
726   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
727   CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent));
728   CeedCallBackend(CeedOperatorGetData(op, &impl));
729   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
730   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
731   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
732   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
733   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
734   active_inputs = impl->qf_active_in;
735   num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
736 
737   // Setup
738   CeedCallBackend(CeedOperatorSetup_Cuda(op));
739 
740   // Input Evecs and Restriction
741   CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
742 
743   // Count number of active input fields
744   if (!num_active_in) {
745     for (CeedInt i = 0; i < num_input_fields; i++) {
746       CeedScalar *q_vec_array;
747       CeedVector  vec;
748 
749       // Get input vector
750       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
751       // Check if active input
752       if (vec == CEED_VECTOR_ACTIVE) {
753         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
754         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
755         CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array));
756         CeedCallBackend(CeedRealloc(num_active_in + size, &active_inputs));
757         for (CeedInt field = 0; field < size; field++) {
758           CeedSize q_size = (CeedSize)Q * num_elem;
759 
760           CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field]));
761           CeedCallBackend(
762               CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem]));
763         }
764         num_active_in += size;
765         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
766       }
767     }
768     impl->num_active_in = num_active_in;
769     impl->qf_active_in  = active_inputs;
770   }
771 
772   // Count number of active output fields
773   if (!num_active_out) {
774     for (CeedInt i = 0; i < num_output_fields; i++) {
775       CeedVector vec;
776 
777       // Get output vector
778       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
779       // Check if active output
780       if (vec == CEED_VECTOR_ACTIVE) {
781         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
782         num_active_out += size;
783       }
784     }
785     impl->num_active_out = num_active_out;
786   }
787 
788   // Check sizes
789   CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
790 
791   // Build objects if needed
792   if (build_objects) {
793     CeedSize l_size     = (CeedSize)num_elem * Q * num_active_in * num_active_out;
794     CeedInt  strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
795 
796     // Create output restriction
797     CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
798                                                      (CeedSize)num_active_in * (CeedSize)num_active_out * (CeedSize)num_elem * (CeedSize)Q, strides,
799                                                      rstr));
800     // Create assembled vector
801     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
802   }
803   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
804   CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array));
805 
806   // Input basis apply
807   CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
808 
809   // Assemble QFunction
810   for (CeedInt in = 0; in < num_active_in; in++) {
811     // Set Inputs
812     CeedCallBackend(CeedVectorSetValue(active_inputs[in], 1.0));
813     if (num_active_in > 1) {
814       CeedCallBackend(CeedVectorSetValue(active_inputs[(in + num_active_in - 1) % num_active_in], 0.0));
815     }
816     // Set Outputs
817     for (CeedInt out = 0; out < num_output_fields; out++) {
818       CeedVector vec;
819 
820       // Get output vector
821       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
822       // Check if active output
823       if (vec == CEED_VECTOR_ACTIVE) {
824         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
825         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
826         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
827       }
828     }
829     // Apply QFunction
830     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
831   }
832 
833   // Un-set output q_vecs to prevent accidental overwrite of Assembled
834   for (CeedInt out = 0; out < num_output_fields; out++) {
835     CeedVector vec;
836 
837     // Get output vector
838     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
839     // Check if active output
840     if (vec == CEED_VECTOR_ACTIVE) {
841       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
842     }
843   }
844 
845   // Restore input arrays
846   CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
847 
848   // Restore output
849   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
850   return CEED_ERROR_SUCCESS;
851 }
852 
853 //------------------------------------------------------------------------------
854 // Assemble Linear QFunction
855 //------------------------------------------------------------------------------
856 static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
857   return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr, request);
858 }
859 
860 //------------------------------------------------------------------------------
861 // Update Assembled Linear QFunction
862 //------------------------------------------------------------------------------
863 static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
864   return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled, &rstr, request);
865 }
866 
867 //------------------------------------------------------------------------------
868 // Assemble Diagonal Setup
869 //------------------------------------------------------------------------------
870 static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
871   Ceed                ceed;
872   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
873   CeedInt             q_comp, num_nodes, num_qpts;
874   CeedEvalMode       *eval_modes_in = NULL, *eval_modes_out = NULL;
875   CeedBasis           basis_in = NULL, basis_out = NULL;
876   CeedQFunctionField *qf_fields;
877   CeedQFunction       qf;
878   CeedOperatorField  *op_fields;
879   CeedOperator_Cuda  *impl;
880 
881   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
882   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
883   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
884 
885   // Determine active input basis
886   CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
887   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
888   for (CeedInt i = 0; i < num_input_fields; i++) {
889     CeedVector vec;
890 
891     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
892     if (vec == CEED_VECTOR_ACTIVE) {
893       CeedBasis    basis;
894       CeedEvalMode eval_mode;
895 
896       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
897       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND,
898                 "Backend does not implement operator diagonal assembly with multiple active bases");
899       basis_in = basis;
900       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
901       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
902       if (eval_mode != CEED_EVAL_WEIGHT) {
903         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly
904         CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in));
905         for (CeedInt d = 0; d < q_comp; d++) eval_modes_in[num_eval_modes_in + d] = eval_mode;
906         num_eval_modes_in += q_comp;
907       }
908     }
909   }
910 
911   // Determine active output basis
912   CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
913   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
914   for (CeedInt i = 0; i < num_output_fields; i++) {
915     CeedVector vec;
916 
917     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
918     if (vec == CEED_VECTOR_ACTIVE) {
919       CeedBasis    basis;
920       CeedEvalMode eval_mode;
921 
922       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
923       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
924                 "Backend does not implement operator diagonal assembly with multiple active bases");
925       basis_out = basis;
926       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
927       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
928       if (eval_mode != CEED_EVAL_WEIGHT) {
929         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly
930         CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out));
931         for (CeedInt d = 0; d < q_comp; d++) eval_modes_out[num_eval_modes_out + d] = eval_mode;
932         num_eval_modes_out += q_comp;
933       }
934     }
935   }
936 
937   // Operator data struct
938   CeedCallBackend(CeedOperatorGetData(op, &impl));
939   CeedCallBackend(CeedCalloc(1, &impl->diag));
940   CeedOperatorDiag_Cuda *diag = impl->diag;
941 
942   // Basis matrices
943   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
944   if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
945   else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
946   const CeedInt interp_bytes     = num_nodes * num_qpts * sizeof(CeedScalar);
947   const CeedInt eval_modes_bytes = sizeof(CeedEvalMode);
948   bool          has_eval_none    = false;
949 
950   // CEED_EVAL_NONE
951   for (CeedInt i = 0; i < num_eval_modes_in; i++) has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE);
952   for (CeedInt i = 0; i < num_eval_modes_out; i++) has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE);
953   if (has_eval_none) {
954     CeedScalar *identity = NULL;
955 
956     CeedCallBackend(CeedCalloc(num_nodes * num_qpts, &identity));
957     for (CeedInt i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0;
958     CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_identity, interp_bytes));
959     CeedCallCuda(ceed, cudaMemcpy(diag->d_identity, identity, interp_bytes, cudaMemcpyHostToDevice));
960     CeedCallBackend(CeedFree(&identity));
961   }
962 
963   // CEED_EVAL_INTERP, CEED_EVAL_GRAD, CEED_EVAL_DIV, and CEED_EVAL_CURL
964   for (CeedInt in = 0; in < 2; in++) {
965     CeedFESpace fespace;
966     CeedBasis   basis = in ? basis_in : basis_out;
967 
968     CeedCallBackend(CeedBasisGetFESpace(basis, &fespace));
969     switch (fespace) {
970       case CEED_FE_SPACE_H1: {
971         CeedInt           q_comp_interp, q_comp_grad;
972         const CeedScalar *interp, *grad;
973         CeedScalar       *d_interp, *d_grad;
974 
975         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
976         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
977 
978         CeedCallBackend(CeedBasisGetInterp(basis, &interp));
979         CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp));
980         CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice));
981         CeedCallBackend(CeedBasisGetGrad(basis, &grad));
982         CeedCallCuda(ceed, cudaMalloc((void **)&d_grad, interp_bytes * q_comp_grad));
983         CeedCallCuda(ceed, cudaMemcpy(d_grad, grad, interp_bytes * q_comp_grad, cudaMemcpyHostToDevice));
984         if (in) {
985           diag->d_interp_in = d_interp;
986           diag->d_grad_in   = d_grad;
987         } else {
988           diag->d_interp_out = d_interp;
989           diag->d_grad_out   = d_grad;
990         }
991       } break;
992       case CEED_FE_SPACE_HDIV: {
993         CeedInt           q_comp_interp, q_comp_div;
994         const CeedScalar *interp, *div;
995         CeedScalar       *d_interp, *d_div;
996 
997         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
998         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
999 
1000         CeedCallBackend(CeedBasisGetInterp(basis, &interp));
1001         CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp));
1002         CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice));
1003         CeedCallBackend(CeedBasisGetDiv(basis, &div));
1004         CeedCallCuda(ceed, cudaMalloc((void **)&d_div, interp_bytes * q_comp_div));
1005         CeedCallCuda(ceed, cudaMemcpy(d_div, div, interp_bytes * q_comp_div, cudaMemcpyHostToDevice));
1006         if (in) {
1007           diag->d_interp_in = d_interp;
1008           diag->d_div_in    = d_div;
1009         } else {
1010           diag->d_interp_out = d_interp;
1011           diag->d_div_out    = d_div;
1012         }
1013       } break;
1014       case CEED_FE_SPACE_HCURL: {
1015         CeedInt           q_comp_interp, q_comp_curl;
1016         const CeedScalar *interp, *curl;
1017         CeedScalar       *d_interp, *d_curl;
1018 
1019         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
1020         CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
1021 
1022         CeedCallBackend(CeedBasisGetInterp(basis, &interp));
1023         CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp));
1024         CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice));
1025         CeedCallBackend(CeedBasisGetCurl(basis, &curl));
1026         CeedCallCuda(ceed, cudaMalloc((void **)&d_curl, interp_bytes * q_comp_curl));
1027         CeedCallCuda(ceed, cudaMemcpy(d_curl, curl, interp_bytes * q_comp_curl, cudaMemcpyHostToDevice));
1028         if (in) {
1029           diag->d_interp_in = d_interp;
1030           diag->d_curl_in   = d_curl;
1031         } else {
1032           diag->d_interp_out = d_interp;
1033           diag->d_curl_out   = d_curl;
1034         }
1035       } break;
1036     }
1037   }
1038 
1039   // Arrays of eval_modes
1040   CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_eval_modes_in, num_eval_modes_in * eval_modes_bytes));
1041   CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_in, eval_modes_in, num_eval_modes_in * eval_modes_bytes, cudaMemcpyHostToDevice));
1042   CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_eval_modes_out, num_eval_modes_out * eval_modes_bytes));
1043   CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, cudaMemcpyHostToDevice));
1044   CeedCallBackend(CeedFree(&eval_modes_in));
1045   CeedCallBackend(CeedFree(&eval_modes_out));
1046   return CEED_ERROR_SUCCESS;
1047 }
1048 
1049 //------------------------------------------------------------------------------
1050 // Assemble Diagonal Setup (Compilation)
1051 //------------------------------------------------------------------------------
1052 static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) {
1053   Ceed                ceed;
1054   char               *diagonal_kernel_source;
1055   const char         *diagonal_kernel_path;
1056   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
1057   CeedInt             num_comp, q_comp, num_nodes, num_qpts;
1058   CeedBasis           basis_in = NULL, basis_out = NULL;
1059   CeedQFunctionField *qf_fields;
1060   CeedQFunction       qf;
1061   CeedOperatorField  *op_fields;
1062   CeedOperator_Cuda  *impl;
1063 
1064   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1065   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1066   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
1067 
1068   // Determine active input basis
1069   CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
1070   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
1071   for (CeedInt i = 0; i < num_input_fields; i++) {
1072     CeedVector vec;
1073 
1074     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
1075     if (vec == CEED_VECTOR_ACTIVE) {
1076       CeedEvalMode eval_mode;
1077 
1078       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in));
1079       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1080       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
1081       if (eval_mode != CEED_EVAL_WEIGHT) {
1082         num_eval_modes_in += q_comp;
1083       }
1084     }
1085   }
1086 
1087   // Determine active output basis
1088   CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
1089   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
1090   for (CeedInt i = 0; i < num_output_fields; i++) {
1091     CeedVector vec;
1092 
1093     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
1094     if (vec == CEED_VECTOR_ACTIVE) {
1095       CeedEvalMode eval_mode;
1096 
1097       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out));
1098       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1099       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
1100       if (eval_mode != CEED_EVAL_WEIGHT) {
1101         num_eval_modes_out += q_comp;
1102       }
1103     }
1104   }
1105 
1106   // Operator data struct
1107   CeedCallBackend(CeedOperatorGetData(op, &impl));
1108   CeedOperatorDiag_Cuda *diag = impl->diag;
1109 
1110   // Assemble kernel
1111   CUmodule *module          = is_point_block ? &diag->module_point_block : &diag->module;
1112   CeedInt   elems_per_block = 1;
1113   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
1114   CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
1115   if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
1116   else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
1117   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path));
1118   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n");
1119   CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source));
1120   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n");
1121   CeedCallCuda(ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
1122                                       num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE",
1123                                       use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block));
1124   CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
1125   CeedCallBackend(CeedFree(&diagonal_kernel_path));
1126   CeedCallBackend(CeedFree(&diagonal_kernel_source));
1127   return CEED_ERROR_SUCCESS;
1128 }
1129 
1130 //------------------------------------------------------------------------------
1131 // Assemble Diagonal Core
1132 //------------------------------------------------------------------------------
1133 static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) {
1134   Ceed                ceed;
1135   CeedInt             num_elem, num_nodes;
1136   CeedScalar         *elem_diag_array;
1137   const CeedScalar   *assembled_qf_array;
1138   CeedVector          assembled_qf   = NULL, elem_diag;
1139   CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out, diag_rstr;
1140   CeedOperator_Cuda  *impl;
1141 
1142   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1143   CeedCallBackend(CeedOperatorGetData(op, &impl));
1144 
1145   // Assemble QFunction
1146   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, request));
1147   CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr));
1148   CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array));
1149 
1150   // Setup
1151   if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op));
1152   CeedOperatorDiag_Cuda *diag = impl->diag;
1153 
1154   assert(diag != NULL);
1155 
1156   // Assemble kernel if needed
1157   if ((!is_point_block && !diag->LinearDiagonal) || (is_point_block && !diag->LinearPointBlock)) {
1158     CeedSize assembled_length, assembled_qf_length;
1159     CeedInt  use_ceedsize_idx = 0;
1160     CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length));
1161     CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
1162     if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
1163 
1164     CeedCallBackend(CeedOperatorAssembleDiagonalSetupCompile_Cuda(op, use_ceedsize_idx, is_point_block));
1165   }
1166 
1167   // Restriction and diagonal vector
1168   CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out));
1169   CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND,
1170             "Cannot assemble operator diagonal with different input and output active element restrictions");
1171   if (!is_point_block && !diag->diag_rstr) {
1172     CeedCallBackend(CeedElemRestrictionCreateUnsignedCopy(rstr_out, &diag->diag_rstr));
1173     CeedCallBackend(CeedElemRestrictionCreateVector(diag->diag_rstr, NULL, &diag->elem_diag));
1174   } else if (is_point_block && !diag->point_block_diag_rstr) {
1175     CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr));
1176     CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag));
1177   }
1178   diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr;
1179   elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag;
1180   CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0));
1181 
1182   // Only assemble diagonal if the basis has nodes, otherwise inputs are null pointers
1183   CeedCallBackend(CeedElemRestrictionGetElementSize(diag_rstr, &num_nodes));
1184   if (num_nodes > 0) {
1185     // Assemble element operator diagonals
1186     CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem));
1187     CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array));
1188 
1189     // Compute the diagonal of B^T D B
1190     CeedInt elems_per_block = 1;
1191     CeedInt grid            = CeedDivUpInt(num_elem, elems_per_block);
1192     void   *args[]          = {(void *)&num_elem,      &diag->d_identity,       &diag->d_interp_in,  &diag->d_grad_in, &diag->d_div_in,
1193                                &diag->d_curl_in,       &diag->d_interp_out,     &diag->d_grad_out,   &diag->d_div_out, &diag->d_curl_out,
1194                                &diag->d_eval_modes_in, &diag->d_eval_modes_out, &assembled_qf_array, &elem_diag_array};
1195 
1196     if (is_point_block) {
1197       CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args));
1198     } else {
1199       CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args));
1200     }
1201 
1202     // Restore arrays
1203     CeedCallBackend(CeedVectorRestoreArray(elem_diag, &elem_diag_array));
1204     CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array));
1205   }
1206 
1207   // Assemble local operator diagonal
1208   CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request));
1209 
1210   // Cleanup
1211   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
1212   return CEED_ERROR_SUCCESS;
1213 }
1214 
1215 //------------------------------------------------------------------------------
1216 // Assemble Linear Diagonal
1217 //------------------------------------------------------------------------------
1218 static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
1219   CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false));
1220   return CEED_ERROR_SUCCESS;
1221 }
1222 
1223 //------------------------------------------------------------------------------
1224 // Assemble Linear Point Block Diagonal
1225 //------------------------------------------------------------------------------
1226 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
1227   CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true));
1228   return CEED_ERROR_SUCCESS;
1229 }
1230 
1231 //------------------------------------------------------------------------------
1232 // Single Operator Assembly Setup
1233 //------------------------------------------------------------------------------
1234 static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) {
1235   Ceed                ceed;
1236   Ceed_Cuda          *cuda_data;
1237   char               *assembly_kernel_source;
1238   const char         *assembly_kernel_path;
1239   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
1240   CeedInt             elem_size_in, num_qpts_in = 0, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp;
1241   CeedEvalMode       *eval_modes_in = NULL, *eval_modes_out = NULL;
1242   CeedElemRestriction rstr_in = NULL, rstr_out = NULL;
1243   CeedBasis           basis_in = NULL, basis_out = NULL;
1244   CeedQFunctionField *qf_fields;
1245   CeedQFunction       qf;
1246   CeedOperatorField  *input_fields, *output_fields;
1247   CeedOperator_Cuda  *impl;
1248 
1249   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1250   CeedCallBackend(CeedOperatorGetData(op, &impl));
1251 
1252   // Get intput and output fields
1253   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
1254 
1255   // Determine active input basis eval mode
1256   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1257   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
1258   for (CeedInt i = 0; i < num_input_fields; i++) {
1259     CeedVector vec;
1260 
1261     CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec));
1262     if (vec == CEED_VECTOR_ACTIVE) {
1263       CeedBasis    basis;
1264       CeedEvalMode eval_mode;
1265 
1266       CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
1267       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
1268       basis_in = basis;
1269       CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in));
1270       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in));
1271       if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in;
1272       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in));
1273       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1274       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
1275       if (eval_mode != CEED_EVAL_WEIGHT) {
1276         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly
1277         CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in));
1278         for (CeedInt d = 0; d < q_comp; d++) {
1279           eval_modes_in[num_eval_modes_in + d] = eval_mode;
1280         }
1281         num_eval_modes_in += q_comp;
1282       }
1283     }
1284   }
1285 
1286   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
1287   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
1288   for (CeedInt i = 0; i < num_output_fields; i++) {
1289     CeedVector vec;
1290 
1291     CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec));
1292     if (vec == CEED_VECTOR_ACTIVE) {
1293       CeedBasis    basis;
1294       CeedEvalMode eval_mode;
1295 
1296       CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis));
1297       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
1298                 "Backend does not implement operator assembly with multiple active bases");
1299       basis_out = basis;
1300       CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out));
1301       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
1302       if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out;
1303       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out));
1304       CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED,
1305                 "Active input and output bases must have the same number of quadrature points");
1306       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1307       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
1308       if (eval_mode != CEED_EVAL_WEIGHT) {
1309         // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly
1310         CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out));
1311         for (CeedInt d = 0; d < q_comp; d++) {
1312           eval_modes_out[num_eval_modes_out + d] = eval_mode;
1313         }
1314         num_eval_modes_out += q_comp;
1315       }
1316     }
1317   }
1318   CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
1319 
1320   CeedCallBackend(CeedCalloc(1, &impl->asmb));
1321   CeedOperatorAssemble_Cuda *asmb = impl->asmb;
1322   asmb->elems_per_block           = 1;
1323   asmb->block_size_x              = elem_size_in;
1324   asmb->block_size_y              = elem_size_out;
1325 
1326   CeedCallBackend(CeedGetData(ceed, &cuda_data));
1327   bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > cuda_data->device_prop.maxThreadsPerBlock;
1328 
1329   if (fallback) {
1330     // Use fallback kernel with 1D threadblock
1331     asmb->block_size_y = 1;
1332   }
1333 
1334   // Compile kernels
1335   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in));
1336   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out));
1337   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path));
1338   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n");
1339   CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source));
1340   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n");
1341   CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
1342                                    num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in,
1343                                    "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE",
1344                                    asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y,
1345                                    "USE_CEEDSIZE", use_ceedsize_idx));
1346   CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble));
1347   CeedCallBackend(CeedFree(&assembly_kernel_path));
1348   CeedCallBackend(CeedFree(&assembly_kernel_source));
1349 
1350   // Load into B_in, in order that they will be used in eval_modes_in
1351   {
1352     const CeedInt in_bytes           = elem_size_in * num_qpts_in * num_eval_modes_in * sizeof(CeedScalar);
1353     CeedInt       d_in               = 0;
1354     CeedEvalMode  eval_modes_in_prev = CEED_EVAL_NONE;
1355     bool          has_eval_none      = false;
1356     CeedScalar   *identity           = NULL;
1357 
1358     for (CeedInt i = 0; i < num_eval_modes_in; i++) {
1359       has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE);
1360     }
1361     if (has_eval_none) {
1362       CeedCallBackend(CeedCalloc(elem_size_in * num_qpts_in, &identity));
1363       for (CeedInt i = 0; i < (elem_size_in < num_qpts_in ? elem_size_in : num_qpts_in); i++) identity[i * elem_size_in + i] = 1.0;
1364     }
1365 
1366     CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_in, in_bytes));
1367     for (CeedInt i = 0; i < num_eval_modes_in; i++) {
1368       const CeedScalar *h_B_in;
1369 
1370       CeedCallBackend(CeedOperatorGetBasisPointer(basis_in, eval_modes_in[i], identity, &h_B_in));
1371       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_modes_in[i], &q_comp));
1372       if (q_comp > 1) {
1373         if (i == 0 || eval_modes_in[i] != eval_modes_in_prev) d_in = 0;
1374         else h_B_in = &h_B_in[(++d_in) * elem_size_in * num_qpts_in];
1375       }
1376       eval_modes_in_prev = eval_modes_in[i];
1377 
1378       CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar),
1379                                     cudaMemcpyHostToDevice));
1380     }
1381 
1382     if (identity) {
1383       CeedCallBackend(CeedFree(&identity));
1384     }
1385   }
1386 
1387   // Load into B_out, in order that they will be used in eval_modes_out
1388   {
1389     const CeedInt out_bytes           = elem_size_out * num_qpts_out * num_eval_modes_out * sizeof(CeedScalar);
1390     CeedInt       d_out               = 0;
1391     CeedEvalMode  eval_modes_out_prev = CEED_EVAL_NONE;
1392     bool          has_eval_none       = false;
1393     CeedScalar   *identity            = NULL;
1394 
1395     for (CeedInt i = 0; i < num_eval_modes_out; i++) {
1396       has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE);
1397     }
1398     if (has_eval_none) {
1399       CeedCallBackend(CeedCalloc(elem_size_out * num_qpts_out, &identity));
1400       for (CeedInt i = 0; i < (elem_size_out < num_qpts_out ? elem_size_out : num_qpts_out); i++) identity[i * elem_size_out + i] = 1.0;
1401     }
1402 
1403     CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, out_bytes));
1404     for (CeedInt i = 0; i < num_eval_modes_out; i++) {
1405       const CeedScalar *h_B_out;
1406 
1407       CeedCallBackend(CeedOperatorGetBasisPointer(basis_out, eval_modes_out[i], identity, &h_B_out));
1408       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_modes_out[i], &q_comp));
1409       if (q_comp > 1) {
1410         if (i == 0 || eval_modes_out[i] != eval_modes_out_prev) d_out = 0;
1411         else h_B_out = &h_B_out[(++d_out) * elem_size_out * num_qpts_out];
1412       }
1413       eval_modes_out_prev = eval_modes_out[i];
1414 
1415       CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar),
1416                                     cudaMemcpyHostToDevice));
1417     }
1418 
1419     if (identity) {
1420       CeedCallBackend(CeedFree(&identity));
1421     }
1422   }
1423   return CEED_ERROR_SUCCESS;
1424 }
1425 
1426 //------------------------------------------------------------------------------
1427 // Assemble matrix data for COO matrix of assembled operator.
1428 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
1429 //
1430 // Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval
1431 // modes).
1432 // TODO: allow multiple active input restrictions/basis objects
1433 //------------------------------------------------------------------------------
1434 static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) {
1435   Ceed                ceed;
1436   CeedSize            values_length = 0, assembled_qf_length = 0;
1437   CeedInt             use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out;
1438   CeedScalar         *values_array;
1439   const CeedScalar   *assembled_qf_array;
1440   CeedVector          assembled_qf   = NULL;
1441   CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out;
1442   CeedRestrictionType rstr_type_in, rstr_type_out;
1443   const bool         *orients_in = NULL, *orients_out = NULL;
1444   const CeedInt8     *curl_orients_in = NULL, *curl_orients_out = NULL;
1445   CeedOperator_Cuda  *impl;
1446 
1447   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1448   CeedCallBackend(CeedOperatorGetData(op, &impl));
1449 
1450   // Assemble QFunction
1451   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, CEED_REQUEST_IMMEDIATE));
1452   CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr));
1453   CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array));
1454 
1455   CeedCallBackend(CeedVectorGetLength(values, &values_length));
1456   CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
1457   if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
1458 
1459   // Setup
1460   if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx));
1461   CeedOperatorAssemble_Cuda *asmb = impl->asmb;
1462 
1463   assert(asmb != NULL);
1464 
1465   // Assemble element operator
1466   CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array));
1467   values_array += offset;
1468 
1469   CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out));
1470   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem_in));
1471   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in));
1472 
1473   CeedCallBackend(CeedElemRestrictionGetType(rstr_in, &rstr_type_in));
1474   if (rstr_type_in == CEED_RESTRICTION_ORIENTED) {
1475     CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_in, CEED_MEM_DEVICE, &orients_in));
1476   } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) {
1477     CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_in, CEED_MEM_DEVICE, &curl_orients_in));
1478   }
1479 
1480   if (rstr_in != rstr_out) {
1481     CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out));
1482     CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED,
1483               "Active input and output operator restrictions must have the same number of elements");
1484     CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
1485 
1486     CeedCallBackend(CeedElemRestrictionGetType(rstr_out, &rstr_type_out));
1487     if (rstr_type_out == CEED_RESTRICTION_ORIENTED) {
1488       CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_out, CEED_MEM_DEVICE, &orients_out));
1489     } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) {
1490       CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_out, CEED_MEM_DEVICE, &curl_orients_out));
1491     }
1492   } else {
1493     elem_size_out    = elem_size_in;
1494     orients_out      = orients_in;
1495     curl_orients_out = curl_orients_in;
1496   }
1497 
1498   // Compute B^T D B
1499   CeedInt shared_mem =
1500       ((curl_orients_in || curl_orients_out ? elem_size_in * elem_size_out : 0) + (curl_orients_in ? elem_size_in * asmb->block_size_y : 0)) *
1501       sizeof(CeedScalar);
1502   CeedInt grid   = CeedDivUpInt(num_elem_in, asmb->elems_per_block);
1503   void   *args[] = {(void *)&num_elem_in, &asmb->d_B_in,     &asmb->d_B_out,      &orients_in,  &curl_orients_in,
1504                     &orients_out,         &curl_orients_out, &assembled_qf_array, &values_array};
1505 
1506   CeedCallBackend(
1507       CeedRunKernelDimShared_Cuda(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args));
1508 
1509   // Restore arrays
1510   CeedCallBackend(CeedVectorRestoreArray(values, &values_array));
1511   CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array));
1512 
1513   // Cleanup
1514   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
1515   if (rstr_type_in == CEED_RESTRICTION_ORIENTED) {
1516     CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_in, &orients_in));
1517   } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) {
1518     CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_in, &curl_orients_in));
1519   }
1520   if (rstr_in != rstr_out) {
1521     if (rstr_type_out == CEED_RESTRICTION_ORIENTED) {
1522       CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_out, &orients_out));
1523     } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) {
1524       CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out));
1525     }
1526   }
1527   return CEED_ERROR_SUCCESS;
1528 }
1529 
1530 //------------------------------------------------------------------------------
1531 // Assemble Linear QFunction AtPoints
1532 //------------------------------------------------------------------------------
1533 static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
1534   return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedOperatorLinearAssembleQFunction");
1535 }
1536 
1537 //------------------------------------------------------------------------------
1538 // Assemble Linear Diagonal AtPoints
1539 //------------------------------------------------------------------------------
1540 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
1541   CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields;
1542   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
1543   CeedQFunctionField *qf_input_fields, *qf_output_fields;
1544   CeedQFunction       qf;
1545   CeedOperatorField  *op_input_fields, *op_output_fields;
1546   CeedOperator_Cuda  *impl;
1547 
1548   CeedCallBackend(CeedOperatorGetData(op, &impl));
1549   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1550   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
1551   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1552   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1553   CeedInt num_points[num_elem];
1554 
1555   // Setup
1556   CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
1557   max_num_points = impl->max_num_points;
1558   for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
1559 
1560   // Input Evecs and Restriction
1561   CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
1562 
1563   // Get point coordinates
1564   if (!impl->point_coords_elem) {
1565     CeedVector          point_coords = NULL;
1566     CeedElemRestriction rstr_points  = NULL;
1567 
1568     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
1569     CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
1570     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
1571   }
1572 
1573   // Clear active input Qvecs
1574   for (CeedInt i = 0; i < num_input_fields; i++) {
1575     CeedVector vec;
1576 
1577     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
1578     if (vec != CEED_VECTOR_ACTIVE) continue;
1579     CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
1580   }
1581 
1582   // Input basis apply if needed
1583   CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
1584 
1585   // Output pointers, as necessary
1586   for (CeedInt i = 0; i < num_output_fields; i++) {
1587     CeedEvalMode eval_mode;
1588 
1589     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1590     if (eval_mode == CEED_EVAL_NONE) {
1591       // Set the output Q-Vector to use the E-Vector data directly.
1592       CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
1593       CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
1594     }
1595   }
1596 
1597   // Loop over active fields
1598   for (CeedInt i = 0; i < num_input_fields; i++) {
1599     bool                is_active_at_points = true;
1600     CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0;
1601     CeedRestrictionType rstr_type;
1602     CeedVector          vec;
1603     CeedElemRestriction elem_rstr;
1604 
1605     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
1606     // -- Skip non-active input
1607     if (vec != CEED_VECTOR_ACTIVE) continue;
1608 
1609     // -- Get active restriction type
1610     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1611     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
1612     is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
1613     if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1614     else elem_size = max_num_points;
1615     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
1616 
1617     e_vec_size = elem_size * num_comp_active;
1618     for (CeedInt s = 0; s < e_vec_size; s++) {
1619       bool         is_active_input = false;
1620       CeedEvalMode eval_mode;
1621       CeedVector   vec;
1622       CeedBasis    basis;
1623 
1624       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
1625       // Skip non-active input
1626       is_active_input = vec == CEED_VECTOR_ACTIVE;
1627       if (!is_active_input) continue;
1628 
1629       // Update unit vector
1630       if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
1631       else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s - 1, e_vec_size, 0.0));
1632       CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 1.0));
1633 
1634       // Basis action
1635       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1636       switch (eval_mode) {
1637         case CEED_EVAL_NONE:
1638           CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
1639           break;
1640         case CEED_EVAL_INTERP:
1641         case CEED_EVAL_GRAD:
1642         case CEED_EVAL_DIV:
1643         case CEED_EVAL_CURL:
1644           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1645           CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
1646                                                  impl->q_vecs_in[i]));
1647           break;
1648         case CEED_EVAL_WEIGHT:
1649           break;  // No action
1650       }
1651 
1652       // Q function
1653       CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
1654 
1655       // Output basis apply if needed
1656       for (CeedInt j = 0; j < num_output_fields; j++) {
1657         bool                is_active_output = false;
1658         CeedInt             elem_size        = 0;
1659         CeedRestrictionType rstr_type;
1660         CeedEvalMode        eval_mode;
1661         CeedVector          vec;
1662         CeedElemRestriction elem_rstr;
1663         CeedBasis           basis;
1664 
1665         CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
1666         // ---- Skip non-active output
1667         is_active_output = vec == CEED_VECTOR_ACTIVE;
1668         if (!is_active_output) continue;
1669 
1670         // ---- Check if elem size matches
1671         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
1672         CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
1673         if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue;
1674         if (rstr_type == CEED_RESTRICTION_POINTS) {
1675           CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &elem_size));
1676         } else {
1677           CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1678         }
1679         {
1680           CeedInt num_comp = 0;
1681 
1682           CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1683           if (e_vec_size != num_comp * elem_size) continue;
1684         }
1685 
1686         // Basis action
1687         CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
1688         switch (eval_mode) {
1689           case CEED_EVAL_NONE:
1690             CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[j + impl->num_inputs], &e_data[j + num_input_fields]));
1691             break;
1692           case CEED_EVAL_INTERP:
1693           case CEED_EVAL_GRAD:
1694           case CEED_EVAL_DIV:
1695           case CEED_EVAL_CURL:
1696             CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
1697             CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
1698                                                    impl->q_vecs_out[j], impl->e_vecs[j + impl->num_inputs]));
1699             break;
1700           // LCOV_EXCL_START
1701           case CEED_EVAL_WEIGHT: {
1702             return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1703             // LCOV_EXCL_STOP
1704           }
1705         }
1706 
1707         // Mask output e-vec
1708         CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs[j + impl->num_inputs], impl->e_vecs[i], impl->e_vecs[j + impl->num_inputs]));
1709 
1710         // Restrict
1711         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
1712         CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[j + impl->num_inputs], assembled, request));
1713 
1714         // Reset q_vec for
1715         if (eval_mode == CEED_EVAL_NONE) {
1716           CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[j + impl->num_inputs], CEED_MEM_DEVICE, &e_data[j + num_input_fields]));
1717           CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[j], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[j + num_input_fields]));
1718         }
1719       }
1720 
1721       // Reset vec
1722       if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
1723     }
1724   }
1725 
1726   // Restore CEED_EVAL_NONE
1727   for (CeedInt i = 0; i < num_output_fields; i++) {
1728     CeedEvalMode eval_mode;
1729 
1730     // Get eval_mode
1731     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1732 
1733     // Restore evec
1734     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1735     if (eval_mode == CEED_EVAL_NONE) {
1736       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
1737     }
1738   }
1739 
1740   // Restore input arrays
1741   CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
1742   return CEED_ERROR_SUCCESS;
1743 }
1744 
1745 //------------------------------------------------------------------------------
1746 // Create operator
1747 //------------------------------------------------------------------------------
1748 int CeedOperatorCreate_Cuda(CeedOperator op) {
1749   Ceed               ceed;
1750   CeedOperator_Cuda *impl;
1751 
1752   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1753   CeedCallBackend(CeedCalloc(1, &impl));
1754   CeedCallBackend(CeedOperatorSetData(op, impl));
1755 
1756   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda));
1757   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda));
1758   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Cuda));
1759   CeedCallBackend(
1760       CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda));
1761   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda));
1762   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda));
1763   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda));
1764   return CEED_ERROR_SUCCESS;
1765 }
1766 
1767 //------------------------------------------------------------------------------
1768 // Create operator AtPoints
1769 //------------------------------------------------------------------------------
1770 int CeedOperatorCreateAtPoints_Cuda(CeedOperator op) {
1771   Ceed               ceed;
1772   CeedOperator_Cuda *impl;
1773 
1774   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1775   CeedCallBackend(CeedCalloc(1, &impl));
1776   CeedCallBackend(CeedOperatorSetData(op, impl));
1777 
1778   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Cuda));
1779   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda));
1780   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Cuda));
1781   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda));
1782   return CEED_ERROR_SUCCESS;
1783 }
1784 
1785 //------------------------------------------------------------------------------
1786