xref: /libCEED/backends/blocked/ceed-blocked-operator.c (revision 77d1c127eaba12da4c1761ef74a16ca3fc16e493)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <stdbool.h>
11 #include <stddef.h>
12 #include <stdint.h>
13 
14 #include "ceed-blocked.h"
15 
16 //------------------------------------------------------------------------------
17 // Setup Input/Output Fields
18 //------------------------------------------------------------------------------
19 static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, CeedElemRestriction *blk_restr, CeedVector *e_vecs_full,
20                                            CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
21   CeedInt  num_comp, size, P;
22   CeedSize e_size, q_size;
23   Ceed     ceed;
24   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
25   CeedBasis           basis;
26   CeedElemRestriction r;
27   CeedOperatorField  *op_fields;
28   CeedQFunctionField *qf_fields;
29   if (is_input) {
30     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
31     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
32   } else {
33     CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
34     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
35   }
36   const CeedInt blk_size = 8;
37 
38   // Loop over fields
39   for (CeedInt i = 0; i < num_fields; i++) {
40     CeedEvalMode eval_mode;
41     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
42 
43     if (eval_mode != CEED_EVAL_WEIGHT) {
44       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &r));
45       CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
46       CeedSize l_size;
47       CeedInt  num_elem, elem_size, comp_stride;
48       CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem));
49       CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size));
50       CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size));
51       CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp));
52 
53       bool strided;
54       CeedCallBackend(CeedElemRestrictionIsStrided(r, &strided));
55       if (strided) {
56         CeedInt strides[3];
57         CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides));
58         CeedCallBackend(
59             CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, elem_size, blk_size, num_comp, l_size, strides, &blk_restr[i + start_e]));
60       } else {
61         const CeedInt *offsets      = NULL;
62         const bool    *orients      = NULL;
63         const CeedInt *curl_orients = NULL;
64         CeedCallBackend(CeedElemRestrictionGetOffsets(r, CEED_MEM_HOST, &offsets));
65         CeedCallBackend(CeedElemRestrictionGetOrientations(r, CEED_MEM_HOST, &orients));
66         CeedCallBackend(CeedElemRestrictionGetCurlOrientations(r, CEED_MEM_HOST, &curl_orients));
67         CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride));
68         if (!orients && !curl_orients) {
69           CeedCallBackend(CeedElemRestrictionCreateBlocked(ceed, num_elem, elem_size, blk_size, num_comp, comp_stride, l_size, CEED_MEM_HOST,
70                                                            CEED_COPY_VALUES, offsets, &blk_restr[i + start_e]));
71         } else if (!curl_orients) {
72           CeedCallBackend(CeedElemRestrictionCreateBlockedOriented(ceed, num_elem, elem_size, blk_size, num_comp, comp_stride, l_size, CEED_MEM_HOST,
73                                                                    CEED_COPY_VALUES, offsets, orients, &blk_restr[i + start_e]));
74         } else {
75           CeedCallBackend(CeedElemRestrictionCreateBlockedCurlOriented(ceed, num_elem, elem_size, blk_size, num_comp, comp_stride, l_size,
76                                                                        CEED_MEM_HOST, CEED_COPY_VALUES, offsets, curl_orients,
77                                                                        &blk_restr[i + start_e]));
78         }
79         CeedCallBackend(CeedElemRestrictionRestoreOffsets(r, &offsets));
80         CeedCallBackend(CeedElemRestrictionRestoreOrientations(r, &orients));
81         CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(r, &curl_orients));
82       }
83       CeedCallBackend(CeedElemRestrictionCreateVector(blk_restr[i + start_e], NULL, &e_vecs_full[i + start_e]));
84     }
85 
86     switch (eval_mode) {
87       case CEED_EVAL_NONE:
88         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
89         q_size = (CeedSize)Q * size * blk_size;
90         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
91         break;
92       case CEED_EVAL_INTERP:
93       case CEED_EVAL_GRAD:
94       case CEED_EVAL_DIV:
95       case CEED_EVAL_CURL:
96         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
97         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
98         CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
99         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
100         e_size = (CeedSize)P * num_comp * blk_size;
101         CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i]));
102         q_size = (CeedSize)Q * size * blk_size;
103         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
104         break;
105       case CEED_EVAL_WEIGHT:  // Only on input fields
106         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
107         q_size = (CeedSize)Q * blk_size;
108         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
109         CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
110         break;
111     }
112   }
113   return CEED_ERROR_SUCCESS;
114 }
115 
116 //------------------------------------------------------------------------------
117 // Setup Operator
118 //------------------------------------------------------------------------------
119 static int CeedOperatorSetup_Blocked(CeedOperator op) {
120   bool is_setup_done;
121   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
122   if (is_setup_done) return CEED_ERROR_SUCCESS;
123   Ceed ceed;
124   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
125   CeedOperator_Blocked *impl;
126   CeedCallBackend(CeedOperatorGetData(op, &impl));
127   CeedQFunction qf;
128   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
129   CeedInt Q, num_input_fields, num_output_fields;
130   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
131   CeedCallBackend(CeedQFunctionIsIdentity(qf, &impl->is_identity_qf));
132   CeedOperatorField *op_input_fields, *op_output_fields;
133   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
134   CeedQFunctionField *qf_input_fields, *qf_output_fields;
135   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
136 
137   // Allocate
138   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->blk_restr));
139   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
140 
141   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
142   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
143   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
144   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
145   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
146 
147   impl->num_inputs  = num_input_fields;
148   impl->num_outputs = num_output_fields;
149 
150   // Set up infield and outfield pointer arrays
151   // Infields
152   CeedCallBackend(
153       CeedOperatorSetupFields_Blocked(qf, op, true, impl->blk_restr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q));
154   // Outfields
155   CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, impl->blk_restr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out,
156                                                   num_input_fields, num_output_fields, Q));
157 
158   // Identity QFunctions
159   if (impl->is_identity_qf) {
160     CeedEvalMode        in_mode, out_mode;
161     CeedQFunctionField *in_fields, *out_fields;
162     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields));
163     CeedCallBackend(CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode));
164     CeedCallBackend(CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode));
165 
166     if (in_mode == CEED_EVAL_NONE && out_mode == CEED_EVAL_NONE) {
167       impl->is_identity_restr_op = true;
168     } else {
169       CeedCallBackend(CeedVectorReferenceCopy(impl->q_vecs_in[0], &impl->q_vecs_out[0]));
170     }
171   }
172 
173   CeedCallBackend(CeedOperatorSetSetupDone(op));
174 
175   return CEED_ERROR_SUCCESS;
176 }
177 
178 //------------------------------------------------------------------------------
179 // Setup Operator Inputs
180 //------------------------------------------------------------------------------
181 static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
182                                                   CeedVector in_vec, bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX],
183                                                   CeedOperator_Blocked *impl, CeedRequest *request) {
184   CeedEvalMode eval_mode;
185   CeedVector   vec;
186   uint64_t     state;
187 
188   for (CeedInt i = 0; i < num_input_fields; i++) {
189     // Get input vector
190     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
191     if (vec == CEED_VECTOR_ACTIVE) {
192       if (skip_active) continue;
193       else vec = in_vec;
194     }
195 
196     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
197     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
198     } else {
199       // Restrict
200       CeedCallBackend(CeedVectorGetState(vec, &state));
201       if (state != impl->input_states[i] || vec == in_vec) {
202         CeedCallBackend(CeedElemRestrictionApply(impl->blk_restr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request));
203         impl->input_states[i] = state;
204       }
205       // Get evec
206       CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i]));
207     }
208   }
209   return CEED_ERROR_SUCCESS;
210 }
211 
212 //------------------------------------------------------------------------------
213 // Input Basis Action
214 //------------------------------------------------------------------------------
215 static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
216                                                  CeedInt num_input_fields, CeedInt blk_size, bool skip_active,
217                                                  CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) {
218   CeedInt             elem_size, size, num_comp;
219   CeedElemRestriction elem_restr;
220   CeedEvalMode        eval_mode;
221   CeedBasis           basis;
222 
223   for (CeedInt i = 0; i < num_input_fields; i++) {
224     // Skip active input
225     if (skip_active) {
226       CeedVector vec;
227       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
228       if (vec == CEED_VECTOR_ACTIVE) continue;
229     }
230 
231     // Get elem_size, eval_mode, size
232     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_restr));
233     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_restr, &elem_size));
234     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
235     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
236     // Basis action
237     switch (eval_mode) {
238       case CEED_EVAL_NONE:
239         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][e * Q * size]));
240         break;
241       case CEED_EVAL_INTERP:
242       case CEED_EVAL_GRAD:
243       case CEED_EVAL_DIV:
244       case CEED_EVAL_CURL:
245         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
246         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
247         CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][e * elem_size * num_comp]));
248         CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i]));
249         break;
250       case CEED_EVAL_WEIGHT:
251         break;  // No action
252     }
253   }
254   return CEED_ERROR_SUCCESS;
255 }
256 
257 //------------------------------------------------------------------------------
258 // Output Basis Action
259 //------------------------------------------------------------------------------
260 static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields,
261                                                   CeedInt blk_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op,
262                                                   CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) {
263   CeedInt             elem_size, num_comp;
264   CeedElemRestriction elem_restr;
265   CeedEvalMode        eval_mode;
266   CeedBasis           basis;
267 
268   for (CeedInt i = 0; i < num_output_fields; i++) {
269     // Get elem_size, eval_mode, size
270     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_restr));
271     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_restr, &elem_size));
272     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
273     // Basis action
274     switch (eval_mode) {
275       case CEED_EVAL_NONE:
276         break;  // No action
277       case CEED_EVAL_INTERP:
278       case CEED_EVAL_GRAD:
279       case CEED_EVAL_DIV:
280       case CEED_EVAL_CURL:
281         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
282         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
283         CeedCallBackend(
284             CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][e * elem_size * num_comp]));
285         CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
286         break;
287       // LCOV_EXCL_START
288       case CEED_EVAL_WEIGHT: {
289         Ceed ceed;
290         CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
291         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
292         // LCOV_EXCL_STOP
293       }
294     }
295   }
296   return CEED_ERROR_SUCCESS;
297 }
298 
299 //------------------------------------------------------------------------------
300 // Restore Input Vectors
301 //------------------------------------------------------------------------------
302 static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
303                                                     bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) {
304   CeedEvalMode eval_mode;
305 
306   for (CeedInt i = 0; i < num_input_fields; i++) {
307     // Skip active inputs
308     if (skip_active) {
309       CeedVector vec;
310       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
311       if (vec == CEED_VECTOR_ACTIVE) continue;
312     }
313     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
314     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
315     } else {
316       CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_full[i], (const CeedScalar **)&e_data_full[i]));
317     }
318   }
319   return CEED_ERROR_SUCCESS;
320 }
321 
322 //------------------------------------------------------------------------------
323 // Operator Apply
324 //------------------------------------------------------------------------------
325 static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
326   CeedOperator_Blocked *impl;
327   CeedCallBackend(CeedOperatorGetData(op, &impl));
328   const CeedInt blk_size = 8;
329   CeedInt       Q, num_input_fields, num_output_fields, num_elem, size;
330   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
331   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
332   CeedInt       num_blks = (num_elem / blk_size) + !!(num_elem % blk_size);
333   CeedQFunction qf;
334   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
335   CeedOperatorField *op_input_fields, *op_output_fields;
336   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
337   CeedQFunctionField *qf_input_fields, *qf_output_fields;
338   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
339   CeedEvalMode eval_mode;
340   CeedVector   vec;
341   CeedScalar  *e_data_full[2 * CEED_FIELD_MAX] = {0};
342 
343   // Setup
344   CeedCallBackend(CeedOperatorSetup_Blocked(op));
345 
346   // Restriction only operator
347   if (impl->is_identity_restr_op) {
348     CeedCallBackend(CeedElemRestrictionApply(impl->blk_restr[0], CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request));
349     CeedCallBackend(CeedElemRestrictionApply(impl->blk_restr[1], CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request));
350     return CEED_ERROR_SUCCESS;
351   }
352 
353   // Input Evecs and Restriction
354   CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data_full, impl, request));
355 
356   // Output Evecs
357   for (CeedInt i = 0; i < num_output_fields; i++) {
358     CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields]));
359   }
360 
361   // Loop through elements
362   for (CeedInt e = 0; e < num_blks * blk_size; e += blk_size) {
363     // Output pointers
364     for (CeedInt i = 0; i < num_output_fields; i++) {
365       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
366       if (eval_mode == CEED_EVAL_NONE) {
367         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
368         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][e * Q * size]));
369       }
370     }
371 
372     // Input basis apply
373     CeedCallBackend(CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields, num_input_fields, blk_size, false, e_data_full, impl));
374 
375     // Q function
376     if (!impl->is_identity_qf) {
377       CeedCallBackend(CeedQFunctionApply(qf, Q * blk_size, impl->q_vecs_in, impl->q_vecs_out));
378     }
379 
380     // Output basis apply
381     CeedCallBackend(CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields, blk_size, num_input_fields, num_output_fields, op,
382                                                     e_data_full, impl));
383   }
384 
385   // Output restriction
386   for (CeedInt i = 0; i < num_output_fields; i++) {
387     // Restore evec
388     CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields]));
389     // Get output vector
390     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
391     // Active
392     if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
393     // Restrict
394     CeedCallBackend(
395         CeedElemRestrictionApply(impl->blk_restr[i + impl->num_inputs], CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request));
396   }
397 
398   // Restore input arrays
399   CeedCallBackend(CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, false, e_data_full, impl));
400 
401   return CEED_ERROR_SUCCESS;
402 }
403 
404 //------------------------------------------------------------------------------
405 // Core code for assembling linear QFunction
406 //------------------------------------------------------------------------------
407 static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator op, bool build_objects, CeedVector *assembled,
408                                                                   CeedElemRestriction *rstr, CeedRequest *request) {
409   CeedOperator_Blocked *impl;
410   CeedCallBackend(CeedOperatorGetData(op, &impl));
411   const CeedInt blk_size = 8;
412   CeedInt       Q, num_input_fields, num_output_fields, num_elem, size;
413   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
414   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
415   CeedInt       num_blks = (num_elem / blk_size) + !!(num_elem % blk_size);
416   CeedQFunction qf;
417   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
418   CeedOperatorField *op_input_fields, *op_output_fields;
419   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
420   CeedQFunctionField *qf_input_fields, *qf_output_fields;
421   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
422   CeedVector  vec, l_vec = impl->qf_l_vec;
423   CeedInt     num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
424   CeedSize    q_size;
425   CeedVector *active_in = impl->qf_active_in;
426   CeedScalar *a, *tmp;
427   Ceed        ceed;
428   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
429   CeedScalar *e_data_full[2 * CEED_FIELD_MAX] = {0};
430 
431   // Setup
432   CeedCallBackend(CeedOperatorSetup_Blocked(op));
433 
434   // Check for identity
435   CeedCheck(!impl->is_identity_qf, ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported");
436 
437   // Input Evecs and Restriction
438   CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request));
439 
440   // Count number of active input fields
441   if (!num_active_in) {
442     for (CeedInt i = 0; i < num_input_fields; i++) {
443       // Get input vector
444       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
445       // Check if active input
446       if (vec == CEED_VECTOR_ACTIVE) {
447         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
448         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
449         CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp));
450         CeedCallBackend(CeedRealloc(num_active_in + size, &active_in));
451         for (CeedInt field = 0; field < size; field++) {
452           q_size = (CeedSize)Q * blk_size;
453           CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field]));
454           CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &tmp[field * Q * blk_size]));
455         }
456         num_active_in += size;
457         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp));
458       }
459     }
460     impl->num_active_in = num_active_in;
461     impl->qf_active_in  = active_in;
462   }
463 
464   // Count number of active output fields
465   if (!num_active_out) {
466     for (CeedInt i = 0; i < num_output_fields; i++) {
467       // Get output vector
468       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
469       // Check if active output
470       if (vec == CEED_VECTOR_ACTIVE) {
471         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
472         num_active_out += size;
473       }
474     }
475     impl->num_active_out = num_active_out;
476   }
477 
478   // Check sizes
479   CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
480 
481   // Setup Lvec
482   if (!l_vec) {
483     CeedSize l_size = (CeedSize)num_blks * blk_size * Q * num_active_in * num_active_out;
484     CeedCallBackend(CeedVectorCreate(ceed, l_size, &l_vec));
485     impl->qf_l_vec = l_vec;
486   }
487   CeedCallBackend(CeedVectorGetArrayWrite(l_vec, CEED_MEM_HOST, &a));
488 
489   // Build objects if needed
490   CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q};
491   if (build_objects) {
492     // Create output restriction
493     CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q,
494                                                      strides, rstr));
495     // Create assembled vector
496     CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out;
497     CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
498   }
499 
500   // Loop through elements
501   for (CeedInt e = 0; e < num_blks * blk_size; e += blk_size) {
502     // Input basis apply
503     CeedCallBackend(CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields, num_input_fields, blk_size, true, e_data_full, impl));
504 
505     // Assemble QFunction
506     for (CeedInt in = 0; in < num_active_in; in++) {
507       // Set Inputs
508       CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0));
509       if (num_active_in > 1) {
510         CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0));
511       }
512       // Set Outputs
513       for (CeedInt out = 0; out < num_output_fields; out++) {
514         // Get output vector
515         CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
516         // Check if active output
517         if (vec == CEED_VECTOR_ACTIVE) {
518           CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, a));
519           CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
520           a += size * Q * blk_size;  // Advance the pointer by the size of the output
521         }
522       }
523       // Apply QFunction
524       CeedCallBackend(CeedQFunctionApply(qf, Q * blk_size, impl->q_vecs_in, impl->q_vecs_out));
525     }
526   }
527 
528   // Un-set output Qvecs to prevent accidental overwrite of Assembled
529   for (CeedInt out = 0; out < num_output_fields; out++) {
530     // Get output vector
531     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
532     // Check if active output
533     if (vec == CEED_VECTOR_ACTIVE) {
534       CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_COPY_VALUES, NULL));
535     }
536   }
537 
538   // Restore input arrays
539   CeedCallBackend(CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, true, e_data_full, impl));
540 
541   // Output blocked restriction
542   CeedCallBackend(CeedVectorRestoreArray(l_vec, &a));
543   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
544   CeedElemRestriction blk_rstr = impl->qf_blk_rstr;
545   if (!impl->qf_blk_rstr) {
546     CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, blk_size, num_active_in * num_active_out,
547                                                             num_active_in * num_active_out * num_elem * Q, strides, &blk_rstr));
548     impl->qf_blk_rstr = blk_rstr;
549   }
550   CeedCallBackend(CeedElemRestrictionApply(blk_rstr, CEED_TRANSPOSE, l_vec, *assembled, request));
551 
552   return CEED_ERROR_SUCCESS;
553 }
554 
555 //------------------------------------------------------------------------------
556 // Assemble Linear QFunction
557 //------------------------------------------------------------------------------
558 static int CeedOperatorLinearAssembleQFunction_Blocked(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
559   return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, true, assembled, rstr, request);
560 }
561 
562 //------------------------------------------------------------------------------
563 // Update Assembled Linear QFunction
564 //------------------------------------------------------------------------------
565 static int CeedOperatorLinearAssembleQFunctionUpdate_Blocked(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
566   return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, false, &assembled, &rstr, request);
567 }
568 
569 //------------------------------------------------------------------------------
570 // Operator Destroy
571 //------------------------------------------------------------------------------
572 static int CeedOperatorDestroy_Blocked(CeedOperator op) {
573   CeedOperator_Blocked *impl;
574   CeedCallBackend(CeedOperatorGetData(op, &impl));
575 
576   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
577     CeedCallBackend(CeedElemRestrictionDestroy(&impl->blk_restr[i]));
578     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i]));
579   }
580   CeedCallBackend(CeedFree(&impl->blk_restr));
581   CeedCallBackend(CeedFree(&impl->e_vecs_full));
582   CeedCallBackend(CeedFree(&impl->input_states));
583 
584   for (CeedInt i = 0; i < impl->num_inputs; i++) {
585     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i]));
586     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i]));
587   }
588   CeedCallBackend(CeedFree(&impl->e_vecs_in));
589   CeedCallBackend(CeedFree(&impl->q_vecs_in));
590 
591   for (CeedInt i = 0; i < impl->num_outputs; i++) {
592     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i]));
593     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
594   }
595   CeedCallBackend(CeedFree(&impl->e_vecs_out));
596   CeedCallBackend(CeedFree(&impl->q_vecs_out));
597 
598   // QFunction assembly data
599   for (CeedInt i = 0; i < impl->num_active_in; i++) {
600     CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
601   }
602   CeedCallBackend(CeedFree(&impl->qf_active_in));
603   CeedCallBackend(CeedVectorDestroy(&impl->qf_l_vec));
604   CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_blk_rstr));
605 
606   CeedCallBackend(CeedFree(&impl));
607   return CEED_ERROR_SUCCESS;
608 }
609 
610 //------------------------------------------------------------------------------
611 // Operator Create
612 //------------------------------------------------------------------------------
613 int CeedOperatorCreate_Blocked(CeedOperator op) {
614   Ceed ceed;
615   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
616   CeedOperator_Blocked *impl;
617 
618   CeedCallBackend(CeedCalloc(1, &impl));
619   CeedCallBackend(CeedOperatorSetData(op, impl));
620 
621   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Blocked));
622   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Blocked));
623   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Blocked));
624   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Blocked));
625   return CEED_ERROR_SUCCESS;
626 }
627 
628 //------------------------------------------------------------------------------
629