xref: /libCEED/backends/blocked/ceed-blocked-operator.c (revision ebbcc8a346ef79f72cc33926c55f927f02fd96aa)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed/ceed.h>
18 #include <ceed/backend.h>
19 #include <stdbool.h>
20 #include <stddef.h>
21 #include <stdint.h>
22 #include "ceed-blocked.h"
23 
24 //------------------------------------------------------------------------------
25 // Setup Input/Output Fields
26 //------------------------------------------------------------------------------
27 static int CeedOperatorSetupFields_Blocked(CeedQFunction qf,
28     CeedOperator op, bool is_input, CeedElemRestriction *blk_restr,
29     CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs,
30     CeedInt start_e, CeedInt num_fields, CeedInt Q) {
31   CeedInt ierr, num_comp, size, P;
32   Ceed ceed;
33   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
34   CeedBasis basis;
35   CeedElemRestriction r;
36   CeedOperatorField *op_fields;
37   CeedQFunctionField *qf_fields;
38   if (is_input) {
39     ierr = CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL);
40     CeedChkBackend(ierr);
41     ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL);
42     CeedChkBackend(ierr);
43   } else {
44     ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields);
45     CeedChkBackend(ierr);
46     ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields);
47     CeedChkBackend(ierr);
48   }
49   const CeedInt blk_size = 8;
50 
51   // Loop over fields
52   for (CeedInt i=0; i<num_fields; i++) {
53     CeedEvalMode eval_mode;
54     ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode);
55     CeedChkBackend(ierr);
56 
57     if (eval_mode != CEED_EVAL_WEIGHT) {
58       ierr = CeedOperatorFieldGetElemRestriction(op_fields[i], &r);
59       CeedChkBackend(ierr);
60       ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
61       CeedInt num_elem, elem_size, l_size, comp_stride;
62       ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr);
63       ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
64       ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr);
65       ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr);
66 
67       bool strided;
68       ierr = CeedElemRestrictionIsStrided(r, &strided); CeedChkBackend(ierr);
69       if (strided) {
70         CeedInt strides[3];
71         ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr);
72         ierr = CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, elem_size,
73                blk_size, num_comp, l_size, strides, &blk_restr[i+start_e]);
74         CeedChkBackend(ierr);
75       } else {
76         const CeedInt *offsets = NULL;
77         ierr = CeedElemRestrictionGetOffsets(r, CEED_MEM_HOST, &offsets);
78         CeedChkBackend(ierr);
79         ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr);
80         ierr = CeedElemRestrictionCreateBlocked(ceed, num_elem, elem_size,
81                                                 blk_size, num_comp, comp_stride,
82                                                 l_size, CEED_MEM_HOST,
83                                                 CEED_COPY_VALUES, offsets,
84                                                 &blk_restr[i+start_e]);
85         CeedChkBackend(ierr);
86         ierr = CeedElemRestrictionRestoreOffsets(r, &offsets); CeedChkBackend(ierr);
87       }
88       ierr = CeedElemRestrictionCreateVector(blk_restr[i+start_e], NULL,
89                                              &e_vecs_full[i+start_e]);
90       CeedChkBackend(ierr);
91     }
92 
93     switch(eval_mode) {
94     case CEED_EVAL_NONE:
95       ierr = CeedQFunctionFieldGetSize(qf_fields[i], &size); CeedChkBackend(ierr);
96       ierr = CeedVectorCreate(ceed, Q*size*blk_size, &q_vecs[i]);
97       CeedChkBackend(ierr);
98       break;
99     case CEED_EVAL_INTERP:
100       ierr = CeedOperatorFieldGetBasis(op_fields[i], &basis); CeedChkBackend(ierr);
101       ierr = CeedQFunctionFieldGetSize(qf_fields[i], &size); CeedChkBackend(ierr);
102       ierr = CeedBasisGetNumNodes(basis, &P); CeedChkBackend(ierr);
103       ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
104       ierr = CeedVectorCreate(ceed, P*num_comp*blk_size, &e_vecs[i]);
105       CeedChkBackend(ierr);
106       ierr = CeedVectorCreate(ceed, Q*size*blk_size, &q_vecs[i]);
107       CeedChkBackend(ierr);
108       break;
109     case CEED_EVAL_GRAD:
110       ierr = CeedOperatorFieldGetBasis(op_fields[i], &basis); CeedChkBackend(ierr);
111       ierr = CeedQFunctionFieldGetSize(qf_fields[i], &size); CeedChkBackend(ierr);
112       ierr = CeedBasisGetNumNodes(basis, &P); CeedChkBackend(ierr);
113       ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
114       ierr = CeedVectorCreate(ceed, P*num_comp*blk_size, &e_vecs[i]);
115       CeedChkBackend(ierr);
116       ierr = CeedVectorCreate(ceed, Q*size*blk_size, &q_vecs[i]);
117       CeedChkBackend(ierr);
118       break;
119     case CEED_EVAL_WEIGHT: // Only on input fields
120       ierr = CeedOperatorFieldGetBasis(op_fields[i], &basis); CeedChkBackend(ierr);
121       ierr = CeedVectorCreate(ceed, Q*blk_size, &q_vecs[i]); CeedChkBackend(ierr);
122       ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE,
123                             CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]);
124       CeedChkBackend(ierr);
125 
126       break;
127     case CEED_EVAL_DIV:
128       break; // Not implemented
129     case CEED_EVAL_CURL:
130       break; // Not implemented
131     }
132   }
133   return CEED_ERROR_SUCCESS;
134 }
135 
136 //------------------------------------------------------------------------------
137 // Setup Operator
138 //------------------------------------------------------------------------------
139 static int CeedOperatorSetup_Blocked(CeedOperator op) {
140   int ierr;
141   bool is_setup_done;
142   ierr = CeedOperatorIsSetupDone(op, &is_setup_done); CeedChkBackend(ierr);
143   if (is_setup_done) return CEED_ERROR_SUCCESS;
144   Ceed ceed;
145   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
146   CeedOperator_Blocked *impl;
147   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
148   CeedQFunction qf;
149   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
150   CeedInt Q, num_input_fields, num_output_fields;
151   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
152   ierr = CeedQFunctionIsIdentity(qf, &impl->is_identity_qf); CeedChkBackend(ierr);
153   CeedOperatorField *op_input_fields, *op_output_fields;
154   ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields,
155                                &num_output_fields, &op_output_fields);
156   CeedChkBackend(ierr);
157   CeedQFunctionField *qf_input_fields, *qf_output_fields;
158   ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL,
159                                 &qf_output_fields);
160   CeedChkBackend(ierr);
161 
162   // Allocate
163   ierr = CeedCalloc(num_input_fields + num_output_fields, &impl->blk_restr);
164   CeedChkBackend(ierr);
165   ierr = CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full);
166   CeedChkBackend(ierr);
167 
168   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->input_states); CeedChkBackend(ierr);
169   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in); CeedChkBackend(ierr);
170   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out); CeedChkBackend(ierr);
171   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in); CeedChkBackend(ierr);
172   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out); CeedChkBackend(ierr);
173 
174   impl->num_inputs = num_input_fields;
175   impl->num_outputs = num_output_fields;
176 
177   // Set up infield and outfield pointer arrays
178   // Infields
179   ierr = CeedOperatorSetupFields_Blocked(qf, op, true, impl->blk_restr,
180                                          impl->e_vecs_full, impl->e_vecs_in,
181                                          impl->q_vecs_in, 0,
182                                          num_input_fields, Q);
183   CeedChkBackend(ierr);
184   // Outfields
185   ierr = CeedOperatorSetupFields_Blocked(qf, op, false, impl->blk_restr,
186                                          impl->e_vecs_full, impl->e_vecs_out,
187                                          impl->q_vecs_out, num_input_fields,
188                                          num_output_fields, Q);
189   CeedChkBackend(ierr);
190 
191   // Identity QFunctions
192   if (impl->is_identity_qf) {
193     CeedEvalMode in_mode, out_mode;
194     CeedQFunctionField *in_fields, *out_fields;
195     ierr = CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields);
196     CeedChkBackend(ierr);
197     ierr = CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode);
198     CeedChkBackend(ierr);
199     ierr = CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode);
200     CeedChkBackend(ierr);
201 
202     if (in_mode == CEED_EVAL_NONE && out_mode == CEED_EVAL_NONE) {
203       impl->is_identity_restr_op = true;
204     } else {
205       ierr = CeedVectorDestroy(&impl->q_vecs_out[0]); CeedChkBackend(ierr);
206       impl->q_vecs_out[0] = impl->q_vecs_in[0];
207       ierr = CeedVectorAddReference(impl->q_vecs_in[0]); CeedChkBackend(ierr);
208     }
209   }
210 
211   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
212 
213   return CEED_ERROR_SUCCESS;
214 }
215 
216 //------------------------------------------------------------------------------
217 // Setup Operator Inputs
218 //------------------------------------------------------------------------------
219 static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields,
220     CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
221     CeedVector in_vec, bool skip_active, CeedScalar *e_data_full[2*CEED_FIELD_MAX],
222     CeedOperator_Blocked *impl, CeedRequest *request) {
223   CeedInt ierr;
224   CeedEvalMode eval_mode;
225   CeedVector vec;
226   uint64_t state;
227 
228   for (CeedInt i=0; i<num_input_fields; i++) {
229     // Get input vector
230     ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
231     CeedChkBackend(ierr);
232     if (vec == CEED_VECTOR_ACTIVE) {
233       if (skip_active)
234         continue;
235       else
236         vec = in_vec;
237     }
238 
239     ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
240     CeedChkBackend(ierr);
241     if (eval_mode == CEED_EVAL_WEIGHT) { // Skip
242     } else {
243       // Restrict
244       ierr = CeedVectorGetState(vec, &state); CeedChkBackend(ierr);
245       if (state != impl->input_states[i] || vec == in_vec) {
246         ierr = CeedElemRestrictionApply(impl->blk_restr[i], CEED_NOTRANSPOSE,
247                                         vec, impl->e_vecs_full[i], request);
248         CeedChkBackend(ierr);
249         impl->input_states[i] = state;
250       }
251       // Get evec
252       ierr = CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST,
253                                     (const CeedScalar **) &e_data_full[i]);
254       CeedChkBackend(ierr);
255     }
256   }
257   return CEED_ERROR_SUCCESS;
258 }
259 
260 //------------------------------------------------------------------------------
261 // Input Basis Action
262 //------------------------------------------------------------------------------
263 static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q,
264     CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
265     CeedInt num_input_fields, CeedInt blk_size, bool skip_active,
266     CeedScalar *e_data_full[2*CEED_FIELD_MAX], CeedOperator_Blocked *impl) {
267   CeedInt ierr;
268   CeedInt dim, elem_size, size;
269   CeedElemRestriction elem_restr;
270   CeedEvalMode eval_mode;
271   CeedBasis basis;
272 
273   for (CeedInt i=0; i<num_input_fields; i++) {
274     // Skip active input
275     if (skip_active) {
276       CeedVector vec;
277       ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
278       CeedChkBackend(ierr);
279       if (vec == CEED_VECTOR_ACTIVE)
280         continue;
281     }
282 
283     // Get elem_size, eval_mode, size
284     ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_restr);
285     CeedChkBackend(ierr);
286     ierr = CeedElemRestrictionGetElementSize(elem_restr, &elem_size);
287     CeedChkBackend(ierr);
288     ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
289     CeedChkBackend(ierr);
290     ierr = CeedQFunctionFieldGetSize(qf_input_fields[i], &size);
291     CeedChkBackend(ierr);
292     // Basis action
293     switch(eval_mode) {
294     case CEED_EVAL_NONE:
295       ierr = CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST,
296                                 CEED_USE_POINTER, &e_data_full[i][e*Q*size]);
297       CeedChkBackend(ierr);
298       break;
299     case CEED_EVAL_INTERP:
300       ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis);
301       CeedChkBackend(ierr);
302       ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST,
303                                 CEED_USE_POINTER, &e_data_full[i][e*elem_size*size]);
304       CeedChkBackend(ierr);
305       ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE,
306                             CEED_EVAL_INTERP, impl->e_vecs_in[i],
307                             impl->q_vecs_in[i]); CeedChkBackend(ierr);
308       break;
309     case CEED_EVAL_GRAD:
310       ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis);
311       CeedChkBackend(ierr);
312       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
313       ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST,
314                                 CEED_USE_POINTER, &e_data_full[i][e*elem_size*size/dim]);
315       CeedChkBackend(ierr);
316       ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE,
317                             CEED_EVAL_GRAD, impl->e_vecs_in[i],
318                             impl->q_vecs_in[i]); CeedChkBackend(ierr);
319       break;
320     case CEED_EVAL_WEIGHT:
321       break;  // No action
322     // LCOV_EXCL_START
323     case CEED_EVAL_DIV:
324     case CEED_EVAL_CURL: {
325       ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis);
326       CeedChkBackend(ierr);
327       Ceed ceed;
328       ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
329       return CeedError(ceed, CEED_ERROR_BACKEND,
330                        "Ceed evaluation mode not implemented");
331       // LCOV_EXCL_STOP
332     }
333     }
334   }
335   return CEED_ERROR_SUCCESS;
336 }
337 
338 //------------------------------------------------------------------------------
339 // Output Basis Action
340 //------------------------------------------------------------------------------
341 static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q,
342     CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields,
343     CeedInt blk_size, CeedInt num_input_fields, CeedInt num_output_fields,
344     CeedOperator op, CeedScalar *e_data_full[2*CEED_FIELD_MAX],
345     CeedOperator_Blocked *impl) {
346   CeedInt ierr;
347   CeedInt dim, elem_size, size;
348   CeedElemRestriction elem_restr;
349   CeedEvalMode eval_mode;
350   CeedBasis basis;
351 
352   for (CeedInt i=0; i<num_output_fields; i++) {
353     // Get elem_size, eval_mode, size
354     ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_restr);
355     CeedChkBackend(ierr);
356     ierr = CeedElemRestrictionGetElementSize(elem_restr, &elem_size);
357     CeedChkBackend(ierr);
358     ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
359     CeedChkBackend(ierr);
360     ierr = CeedQFunctionFieldGetSize(qf_output_fields[i], &size);
361     CeedChkBackend(ierr);
362     // Basis action
363     switch(eval_mode) {
364     case CEED_EVAL_NONE:
365       break; // No action
366     case CEED_EVAL_INTERP:
367       ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis);
368       CeedChkBackend(ierr);
369       ierr = CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST,
370                                 CEED_USE_POINTER, &e_data_full[i + num_input_fields][e*elem_size*size]);
371       CeedChkBackend(ierr);
372       ierr = CeedBasisApply(basis, blk_size, CEED_TRANSPOSE,
373                             CEED_EVAL_INTERP, impl->q_vecs_out[i],
374                             impl->e_vecs_out[i]); CeedChkBackend(ierr);
375       break;
376     case CEED_EVAL_GRAD:
377       ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis);
378       CeedChkBackend(ierr);
379       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
380       ierr = CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST,
381                                 CEED_USE_POINTER, &e_data_full[i + num_input_fields][e*elem_size*size/dim]);
382       CeedChkBackend(ierr);
383       ierr = CeedBasisApply(basis, blk_size, CEED_TRANSPOSE,
384                             CEED_EVAL_GRAD, impl->q_vecs_out[i],
385                             impl->e_vecs_out[i]); CeedChkBackend(ierr);
386       break;
387     // LCOV_EXCL_START
388     case CEED_EVAL_WEIGHT: {
389       Ceed ceed;
390       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
391       return CeedError(ceed, CEED_ERROR_BACKEND,
392                        "CEED_EVAL_WEIGHT cannot be an output "
393                        "evaluation mode");
394     }
395     case CEED_EVAL_DIV:
396     case CEED_EVAL_CURL: {
397       Ceed ceed;
398       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
399       return CeedError(ceed, CEED_ERROR_BACKEND,
400                        "Ceed evaluation mode not implemented");
401       // LCOV_EXCL_STOP
402     }
403     }
404   }
405   return CEED_ERROR_SUCCESS;
406 }
407 
408 //------------------------------------------------------------------------------
409 // Restore Input Vectors
410 //------------------------------------------------------------------------------
411 static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields,
412     CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
413     bool skip_active, CeedScalar *e_data_full[2*CEED_FIELD_MAX],
414     CeedOperator_Blocked *impl) {
415   CeedInt ierr;
416   CeedEvalMode eval_mode;
417 
418   for (CeedInt i=0; i<num_input_fields; i++) {
419     // Skip active inputs
420     if (skip_active) {
421       CeedVector vec;
422       ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
423       CeedChkBackend(ierr);
424       if (vec == CEED_VECTOR_ACTIVE)
425         continue;
426     }
427     ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
428     CeedChkBackend(ierr);
429     if (eval_mode == CEED_EVAL_WEIGHT) { // Skip
430     } else {
431       ierr = CeedVectorRestoreArrayRead(impl->e_vecs_full[i],
432                                         (const CeedScalar **) &e_data_full[i]);
433       CeedChkBackend(ierr);
434     }
435   }
436   return CEED_ERROR_SUCCESS;
437 }
438 
439 //------------------------------------------------------------------------------
440 // Operator Apply
441 //------------------------------------------------------------------------------
442 static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec,
443                                         CeedVector out_vec,
444                                         CeedRequest *request) {
445   int ierr;
446   CeedOperator_Blocked *impl;
447   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
448   const CeedInt blk_size = 8;
449   CeedInt Q, num_input_fields, num_output_fields, num_elem, size;
450   ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr);
451   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
452   CeedInt num_blks = (num_elem/blk_size) + !!(num_elem%blk_size);
453   CeedQFunction qf;
454   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
455   CeedOperatorField *op_input_fields, *op_output_fields;
456   ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields,
457                                &num_output_fields, &op_output_fields);
458   CeedChkBackend(ierr);
459   CeedQFunctionField *qf_input_fields, *qf_output_fields;
460   ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL,
461                                 &qf_output_fields);
462   CeedChkBackend(ierr);
463   CeedEvalMode eval_mode;
464   CeedVector vec;
465   CeedScalar *e_data_full[2*CEED_FIELD_MAX] = {0};
466 
467   // Setup
468   ierr = CeedOperatorSetup_Blocked(op); CeedChkBackend(ierr);
469 
470   // Restriction only operator
471   if (impl->is_identity_restr_op) {
472     ierr = CeedElemRestrictionApply(impl->blk_restr[0], CEED_NOTRANSPOSE, in_vec,
473                                     impl->e_vecs_full[0], request); CeedChkBackend(ierr);
474     ierr = CeedElemRestrictionApply(impl->blk_restr[1], CEED_TRANSPOSE,
475                                     impl->e_vecs_full[0], out_vec, request); CeedChkBackend(ierr);
476     return CEED_ERROR_SUCCESS;
477   }
478 
479   // Input Evecs and Restriction
480   ierr = CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields,
481                                          op_input_fields, in_vec, false, e_data_full,
482                                          impl, request); CeedChkBackend(ierr);
483 
484   // Output Evecs
485   for (CeedInt i=0; i<num_output_fields; i++) {
486     ierr = CeedVectorGetArrayWrite(impl->e_vecs_full[i+impl->num_inputs],
487                                    CEED_MEM_HOST, &e_data_full[i + num_input_fields]);
488     CeedChkBackend(ierr);
489   }
490 
491   // Loop through elements
492   for (CeedInt e=0; e<num_blks*blk_size; e+=blk_size) {
493     // Output pointers
494     for (CeedInt i=0; i<num_output_fields; i++) {
495       ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
496       CeedChkBackend(ierr);
497       if (eval_mode == CEED_EVAL_NONE) {
498         ierr = CeedQFunctionFieldGetSize(qf_output_fields[i], &size);
499         CeedChkBackend(ierr);
500         ierr = CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST,
501                                   CEED_USE_POINTER, &e_data_full[i + num_input_fields][e*Q*size]);
502         CeedChkBackend(ierr);
503       }
504     }
505 
506     // Input basis apply
507     ierr = CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields,
508                                           num_input_fields, blk_size, false, e_data_full,
509                                           impl); CeedChkBackend(ierr);
510 
511     // Q function
512     if (!impl->is_identity_qf) {
513       ierr = CeedQFunctionApply(qf, Q*blk_size, impl->q_vecs_in, impl->q_vecs_out);
514       CeedChkBackend(ierr);
515     }
516 
517     // Output basis apply
518     ierr = CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields,
519                                            blk_size, num_input_fields,
520                                            num_output_fields, op, e_data_full, impl);
521     CeedChkBackend(ierr);
522   }
523 
524   // Output restriction
525   for (CeedInt i=0; i<num_output_fields; i++) {
526     // Restore evec
527     ierr = CeedVectorRestoreArray(impl->e_vecs_full[i+impl->num_inputs],
528                                   &e_data_full[i + num_input_fields]);
529     CeedChkBackend(ierr);
530     // Get output vector
531     ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec);
532     CeedChkBackend(ierr);
533     // Active
534     if (vec == CEED_VECTOR_ACTIVE)
535       vec = out_vec;
536     // Restrict
537     ierr = CeedElemRestrictionApply(impl->blk_restr[i+impl->num_inputs],
538                                     CEED_TRANSPOSE, impl->e_vecs_full[i+impl->num_inputs],
539                                     vec, request); CeedChkBackend(ierr);
540 
541   }
542 
543   // Restore input arrays
544   ierr = CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields,
545          op_input_fields, false, e_data_full, impl); CeedChkBackend(ierr);
546 
547   return CEED_ERROR_SUCCESS;
548 }
549 
550 //------------------------------------------------------------------------------
551 // Core code for assembling linear QFunction
552 //------------------------------------------------------------------------------
553 static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(
554   CeedOperator op, bool build_objects, CeedVector *assembled,
555   CeedElemRestriction *rstr, CeedRequest *request) {
556   int ierr;
557   CeedOperator_Blocked *impl;
558   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
559   const CeedInt blk_size = 8;
560   CeedInt Q, num_input_fields, num_output_fields, num_elem, size;
561   ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr);
562   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
563   CeedInt num_blks = (num_elem/blk_size) + !!(num_elem%blk_size);
564   CeedQFunction qf;
565   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
566   CeedOperatorField *op_input_fields, *op_output_fields;
567   ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields,
568                                &num_output_fields, &op_output_fields);
569   CeedChkBackend(ierr);
570   CeedQFunctionField *qf_input_fields, *qf_output_fields;
571   ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL,
572                                 &qf_output_fields);
573   CeedChkBackend(ierr);
574   CeedVector vec, l_vec = impl->qf_l_vec;
575   CeedInt num_active_in = impl->num_active_in,
576           num_active_out = impl->num_active_out;
577   CeedVector *active_in = impl->qf_active_in;
578   CeedScalar *a, *tmp;
579   Ceed ceed;
580   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
581   CeedScalar *e_data_full[2*CEED_FIELD_MAX] = {0};
582 
583   // Setup
584   ierr = CeedOperatorSetup_Blocked(op); CeedChkBackend(ierr);
585 
586   // Check for identity
587   if (impl->is_identity_qf)
588     // LCOV_EXCL_START
589     return CeedError(ceed, CEED_ERROR_BACKEND,
590                      "Assembling identity QFunctions not supported");
591   // LCOV_EXCL_STOP
592 
593   // Input Evecs and Restriction
594   ierr = CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields,
595                                          op_input_fields, NULL, true, e_data_full,
596                                          impl, request); CeedChkBackend(ierr);
597 
598   // Count number of active input fields
599   if (!num_active_in) {
600     for (CeedInt i=0; i<num_input_fields; i++) {
601       // Get input vector
602       ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
603       CeedChkBackend(ierr);
604       // Check if active input
605       if (vec == CEED_VECTOR_ACTIVE) {
606         ierr = CeedQFunctionFieldGetSize(qf_input_fields[i], &size);
607         CeedChkBackend(ierr);
608         ierr = CeedVectorSetValue(impl->q_vecs_in[i], 0.0); CeedChkBackend(ierr);
609         ierr = CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp);
610         CeedChkBackend(ierr);
611         ierr = CeedRealloc(num_active_in + size, &active_in); CeedChkBackend(ierr);
612         for (CeedInt field=0; field<size; field++) {
613           ierr = CeedVectorCreate(ceed, Q*blk_size, &active_in[num_active_in+field]);
614           CeedChkBackend(ierr);
615           ierr = CeedVectorSetArray(active_in[num_active_in+field], CEED_MEM_HOST,
616                                     CEED_USE_POINTER, &tmp[field*Q*blk_size]);
617           CeedChkBackend(ierr);
618         }
619         num_active_in += size;
620         ierr = CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp); CeedChkBackend(ierr);
621       }
622     }
623     impl->num_active_in = num_active_in;
624     impl->qf_active_in = active_in;
625   }
626 
627   // Count number of active output fields
628   if (!num_active_out) {
629     for (CeedInt i=0; i<num_output_fields; i++) {
630       // Get output vector
631       ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec);
632       CeedChkBackend(ierr);
633       // Check if active output
634       if (vec == CEED_VECTOR_ACTIVE) {
635         ierr = CeedQFunctionFieldGetSize(qf_output_fields[i], &size);
636         CeedChkBackend(ierr);
637         num_active_out += size;
638       }
639     }
640     impl->num_active_out = num_active_out;
641   }
642 
643   // Check sizes
644   if (!num_active_in || !num_active_out)
645     // LCOV_EXCL_START
646     return CeedError(ceed, CEED_ERROR_BACKEND,
647                      "Cannot assemble QFunction without active inputs "
648                      "and outputs");
649   // LCOV_EXCL_STOP
650 
651   // Setup Lvec
652   if (!l_vec) {
653     ierr = CeedVectorCreate(ceed, num_blks*blk_size*Q*num_active_in*num_active_out,
654                             &l_vec); CeedChkBackend(ierr);
655     impl->qf_l_vec = l_vec;
656   }
657   ierr = CeedVectorGetArrayWrite(l_vec, CEED_MEM_HOST, &a); CeedChkBackend(ierr);
658 
659   // Build objects if needed
660   CeedInt strides[3] = {1, Q, num_active_in *num_active_out*Q};
661   if (build_objects) {
662     // Create output restriction
663     ierr = CeedElemRestrictionCreateStrided(ceed, num_elem, Q,
664                                             num_active_in*num_active_out,
665                                             num_active_in*num_active_out*num_elem*Q,
666                                             strides, rstr); CeedChkBackend(ierr);
667     // Create assembled vector
668     ierr = CeedVectorCreate(ceed, num_elem*Q*num_active_in*num_active_out,
669                             assembled); CeedChkBackend(ierr);
670   }
671 
672   // Loop through elements
673   for (CeedInt e=0; e<num_blks*blk_size; e+=blk_size) {
674     // Input basis apply
675     ierr = CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields,
676                                           num_input_fields, blk_size, true, e_data_full,
677                                           impl); CeedChkBackend(ierr);
678 
679     // Assemble QFunction
680     for (CeedInt in=0; in<num_active_in; in++) {
681       // Set Inputs
682       ierr = CeedVectorSetValue(active_in[in], 1.0); CeedChkBackend(ierr);
683       if (num_active_in > 1) {
684         ierr = CeedVectorSetValue(active_in[(in+num_active_in-1)%num_active_in],
685                                   0.0); CeedChkBackend(ierr);
686       }
687       // Set Outputs
688       for (CeedInt out=0; out<num_output_fields; out++) {
689         // Get output vector
690         ierr = CeedOperatorFieldGetVector(op_output_fields[out], &vec);
691         CeedChkBackend(ierr);
692         // Check if active output
693         if (vec == CEED_VECTOR_ACTIVE) {
694           CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST,
695                              CEED_USE_POINTER, a); CeedChkBackend(ierr);
696           ierr = CeedQFunctionFieldGetSize(qf_output_fields[out], &size);
697           CeedChkBackend(ierr);
698           a += size*Q*blk_size; // Advance the pointer by the size of the output
699         }
700       }
701       // Apply QFunction
702       ierr = CeedQFunctionApply(qf, Q*blk_size, impl->q_vecs_in, impl->q_vecs_out);
703       CeedChkBackend(ierr);
704     }
705   }
706 
707   // Un-set output Qvecs to prevent accidental overwrite of Assembled
708   for (CeedInt out=0; out<num_output_fields; out++) {
709     // Get output vector
710     ierr = CeedOperatorFieldGetVector(op_output_fields[out], &vec);
711     CeedChkBackend(ierr);
712     // Check if active output
713     if (vec == CEED_VECTOR_ACTIVE) {
714       CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_COPY_VALUES,
715                          NULL); CeedChkBackend(ierr);
716     }
717   }
718 
719   // Restore input arrays
720   ierr = CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields,
721          op_input_fields, true, e_data_full, impl); CeedChkBackend(ierr);
722 
723   // Output blocked restriction
724   ierr = CeedVectorRestoreArray(l_vec, &a); CeedChkBackend(ierr);
725   ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr);
726   CeedElemRestriction blk_rstr = impl->qf_blk_rstr;
727   if (!impl->qf_blk_rstr) {
728     ierr = CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, blk_size,
729            num_active_in*num_active_out, num_active_in*num_active_out*num_elem*Q,
730            strides, &blk_rstr); CeedChkBackend(ierr);
731     impl->qf_blk_rstr = blk_rstr;
732   }
733   ierr = CeedElemRestrictionApply(blk_rstr, CEED_TRANSPOSE, l_vec, *assembled,
734                                   request); CeedChkBackend(ierr);
735 
736   return CEED_ERROR_SUCCESS;
737 }
738 
739 //------------------------------------------------------------------------------
740 // Assemble Linear QFunction
741 //------------------------------------------------------------------------------
742 static int CeedOperatorLinearAssembleQFunction_Blocked(CeedOperator op,
743     CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
744   return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, true, assembled,
745          rstr, request);
746 }
747 
748 //------------------------------------------------------------------------------
749 // Update Assembled Linear QFunction
750 //------------------------------------------------------------------------------
751 static int CeedOperatorLinearAssembleQFunctionUpdate_Blocked(CeedOperator op,
752     CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
753   return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, false, &assembled,
754          &rstr, request);
755 }
756 
757 //------------------------------------------------------------------------------
758 // Operator Destroy
759 //------------------------------------------------------------------------------
760 static int CeedOperatorDestroy_Blocked(CeedOperator op) {
761   int ierr;
762   CeedOperator_Blocked *impl;
763   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
764 
765   for (CeedInt i=0; i<impl->num_inputs+impl->num_outputs; i++) {
766     ierr = CeedElemRestrictionDestroy(&impl->blk_restr[i]); CeedChkBackend(ierr);
767     ierr = CeedVectorDestroy(&impl->e_vecs_full[i]); CeedChkBackend(ierr);
768   }
769   ierr = CeedFree(&impl->blk_restr); CeedChkBackend(ierr);
770   ierr = CeedFree(&impl->e_vecs_full); CeedChkBackend(ierr);
771   ierr = CeedFree(&impl->input_states); CeedChkBackend(ierr);
772 
773   for (CeedInt i=0; i<impl->num_inputs; i++) {
774     ierr = CeedVectorDestroy(&impl->e_vecs_in[i]); CeedChkBackend(ierr);
775     ierr = CeedVectorDestroy(&impl->q_vecs_in[i]); CeedChkBackend(ierr);
776   }
777   ierr = CeedFree(&impl->e_vecs_in); CeedChkBackend(ierr);
778   ierr = CeedFree(&impl->q_vecs_in); CeedChkBackend(ierr);
779 
780   for (CeedInt i=0; i<impl->num_outputs; i++) {
781     ierr = CeedVectorDestroy(&impl->e_vecs_out[i]); CeedChkBackend(ierr);
782     ierr = CeedVectorDestroy(&impl->q_vecs_out[i]); CeedChkBackend(ierr);
783   }
784   ierr = CeedFree(&impl->e_vecs_out); CeedChkBackend(ierr);
785   ierr = CeedFree(&impl->q_vecs_out); CeedChkBackend(ierr);
786 
787   // QFunction assembly data
788   for (CeedInt i=0; i<impl->num_active_in; i++) {
789     ierr = CeedVectorDestroy(&impl->qf_active_in[i]); CeedChkBackend(ierr);
790   }
791   ierr = CeedFree(&impl->qf_active_in); CeedChkBackend(ierr);
792   ierr = CeedVectorDestroy(&impl->qf_l_vec); CeedChkBackend(ierr);
793   ierr = CeedElemRestrictionDestroy(&impl->qf_blk_rstr); CeedChkBackend(ierr);
794 
795   ierr = CeedFree(&impl); CeedChkBackend(ierr);
796   return CEED_ERROR_SUCCESS;
797 }
798 
799 //------------------------------------------------------------------------------
800 // Operator Create
801 //------------------------------------------------------------------------------
802 int CeedOperatorCreate_Blocked(CeedOperator op) {
803   int ierr;
804   Ceed ceed;
805   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
806   CeedOperator_Blocked *impl;
807 
808   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
809   ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr);
810 
811   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction",
812                                 CeedOperatorLinearAssembleQFunction_Blocked);
813   CeedChkBackend(ierr);
814   ierr = CeedSetBackendFunction(ceed, "Operator", op,
815                                 "LinearAssembleQFunctionUpdate",
816                                 CeedOperatorLinearAssembleQFunctionUpdate_Blocked);
817   CeedChkBackend(ierr);
818   ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd",
819                                 CeedOperatorApplyAdd_Blocked); CeedChkBackend(ierr);
820   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
821                                 CeedOperatorDestroy_Blocked); CeedChkBackend(ierr);
822   return CEED_ERROR_SUCCESS;
823 }
824 //------------------------------------------------------------------------------
825