xref: /libCEED/backends/blocked/ceed-blocked-operator.c (revision 539ec17d7efe6a80c4ab8b3d6b91c3433981191e)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed/ceed.h>
18 #include <ceed/backend.h>
19 #include <stdbool.h>
20 #include <stddef.h>
21 #include <stdint.h>
22 #include "ceed-blocked.h"
23 
24 //------------------------------------------------------------------------------
25 // Setup Input/Output Fields
26 //------------------------------------------------------------------------------
27 static int CeedOperatorSetupFields_Blocked(CeedQFunction qf,
28     CeedOperator op, bool is_input, CeedElemRestriction *blk_restr,
29     CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs,
30     CeedInt start_e, CeedInt num_fields, CeedInt Q) {
31   CeedInt ierr, num_comp, size, P;
32   CeedSize e_size, q_size;
33   Ceed ceed;
34   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
35   CeedBasis basis;
36   CeedElemRestriction r;
37   CeedOperatorField *op_fields;
38   CeedQFunctionField *qf_fields;
39   if (is_input) {
40     ierr = CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL);
41     CeedChkBackend(ierr);
42     ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL);
43     CeedChkBackend(ierr);
44   } else {
45     ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields);
46     CeedChkBackend(ierr);
47     ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields);
48     CeedChkBackend(ierr);
49   }
50   const CeedInt blk_size = 8;
51 
52   // Loop over fields
53   for (CeedInt i=0; i<num_fields; i++) {
54     CeedEvalMode eval_mode;
55     ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode);
56     CeedChkBackend(ierr);
57 
58     if (eval_mode != CEED_EVAL_WEIGHT) {
59       ierr = CeedOperatorFieldGetElemRestriction(op_fields[i], &r);
60       CeedChkBackend(ierr);
61       ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
62       CeedSize l_size;
63       CeedInt num_elem, elem_size, comp_stride;
64       ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr);
65       ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
66       ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr);
67       ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr);
68 
69       bool strided;
70       ierr = CeedElemRestrictionIsStrided(r, &strided); CeedChkBackend(ierr);
71       if (strided) {
72         CeedInt strides[3];
73         ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr);
74         ierr = CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, elem_size,
75                blk_size, num_comp, l_size, strides, &blk_restr[i+start_e]);
76         CeedChkBackend(ierr);
77       } else {
78         const CeedInt *offsets = NULL;
79         ierr = CeedElemRestrictionGetOffsets(r, CEED_MEM_HOST, &offsets);
80         CeedChkBackend(ierr);
81         ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr);
82         ierr = CeedElemRestrictionCreateBlocked(ceed, num_elem, elem_size,
83                                                 blk_size, num_comp, comp_stride,
84                                                 l_size, CEED_MEM_HOST,
85                                                 CEED_COPY_VALUES, offsets,
86                                                 &blk_restr[i+start_e]);
87         CeedChkBackend(ierr);
88         ierr = CeedElemRestrictionRestoreOffsets(r, &offsets); CeedChkBackend(ierr);
89       }
90       ierr = CeedElemRestrictionCreateVector(blk_restr[i+start_e], NULL,
91                                              &e_vecs_full[i+start_e]);
92       CeedChkBackend(ierr);
93     }
94 
95     switch(eval_mode) {
96     case CEED_EVAL_NONE:
97       ierr = CeedQFunctionFieldGetSize(qf_fields[i], &size); CeedChkBackend(ierr);
98       q_size = (CeedSize)Q*size*blk_size;
99       ierr = CeedVectorCreate(ceed, q_size, &q_vecs[i]); CeedChkBackend(ierr);
100       break;
101     case CEED_EVAL_INTERP:
102     case CEED_EVAL_GRAD:
103       ierr = CeedOperatorFieldGetBasis(op_fields[i], &basis); CeedChkBackend(ierr);
104       ierr = CeedQFunctionFieldGetSize(qf_fields[i], &size); CeedChkBackend(ierr);
105       ierr = CeedBasisGetNumNodes(basis, &P); CeedChkBackend(ierr);
106       ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
107       e_size = (CeedSize)P*num_comp*blk_size;
108       ierr = CeedVectorCreate(ceed, e_size, &e_vecs[i]); CeedChkBackend(ierr);
109       q_size = (CeedSize)Q*size*blk_size;
110       ierr = CeedVectorCreate(ceed, q_size, &q_vecs[i]); CeedChkBackend(ierr);
111       break;
112     case CEED_EVAL_WEIGHT: // Only on input fields
113       ierr = CeedOperatorFieldGetBasis(op_fields[i], &basis); CeedChkBackend(ierr);
114       q_size = (CeedSize)Q*blk_size;
115       ierr = CeedVectorCreate(ceed, q_size, &q_vecs[i]); CeedChkBackend(ierr);
116       ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE,
117                             CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]);
118       CeedChkBackend(ierr);
119 
120       break;
121     case CEED_EVAL_DIV:
122       break; // Not implemented
123     case CEED_EVAL_CURL:
124       break; // Not implemented
125     }
126   }
127   return CEED_ERROR_SUCCESS;
128 }
129 
130 //------------------------------------------------------------------------------
131 // Setup Operator
132 //------------------------------------------------------------------------------
133 static int CeedOperatorSetup_Blocked(CeedOperator op) {
134   int ierr;
135   bool is_setup_done;
136   ierr = CeedOperatorIsSetupDone(op, &is_setup_done); CeedChkBackend(ierr);
137   if (is_setup_done) return CEED_ERROR_SUCCESS;
138   Ceed ceed;
139   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
140   CeedOperator_Blocked *impl;
141   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
142   CeedQFunction qf;
143   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
144   CeedInt Q, num_input_fields, num_output_fields;
145   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
146   ierr = CeedQFunctionIsIdentity(qf, &impl->is_identity_qf); CeedChkBackend(ierr);
147   CeedOperatorField *op_input_fields, *op_output_fields;
148   ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields,
149                                &num_output_fields, &op_output_fields);
150   CeedChkBackend(ierr);
151   CeedQFunctionField *qf_input_fields, *qf_output_fields;
152   ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL,
153                                 &qf_output_fields);
154   CeedChkBackend(ierr);
155 
156   // Allocate
157   ierr = CeedCalloc(num_input_fields + num_output_fields, &impl->blk_restr);
158   CeedChkBackend(ierr);
159   ierr = CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full);
160   CeedChkBackend(ierr);
161 
162   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->input_states); CeedChkBackend(ierr);
163   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in); CeedChkBackend(ierr);
164   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out); CeedChkBackend(ierr);
165   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in); CeedChkBackend(ierr);
166   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out); CeedChkBackend(ierr);
167 
168   impl->num_inputs = num_input_fields;
169   impl->num_outputs = num_output_fields;
170 
171   // Set up infield and outfield pointer arrays
172   // Infields
173   ierr = CeedOperatorSetupFields_Blocked(qf, op, true, impl->blk_restr,
174                                          impl->e_vecs_full, impl->e_vecs_in,
175                                          impl->q_vecs_in, 0,
176                                          num_input_fields, Q);
177   CeedChkBackend(ierr);
178   // Outfields
179   ierr = CeedOperatorSetupFields_Blocked(qf, op, false, impl->blk_restr,
180                                          impl->e_vecs_full, impl->e_vecs_out,
181                                          impl->q_vecs_out, num_input_fields,
182                                          num_output_fields, Q);
183   CeedChkBackend(ierr);
184 
185   // Identity QFunctions
186   if (impl->is_identity_qf) {
187     CeedEvalMode in_mode, out_mode;
188     CeedQFunctionField *in_fields, *out_fields;
189     ierr = CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields);
190     CeedChkBackend(ierr);
191     ierr = CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode);
192     CeedChkBackend(ierr);
193     ierr = CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode);
194     CeedChkBackend(ierr);
195 
196     if (in_mode == CEED_EVAL_NONE && out_mode == CEED_EVAL_NONE) {
197       impl->is_identity_restr_op = true;
198     } else {
199       ierr = CeedVectorDestroy(&impl->q_vecs_out[0]); CeedChkBackend(ierr);
200       impl->q_vecs_out[0] = impl->q_vecs_in[0];
201       ierr = CeedVectorAddReference(impl->q_vecs_in[0]); CeedChkBackend(ierr);
202     }
203   }
204 
205   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
206 
207   return CEED_ERROR_SUCCESS;
208 }
209 
210 //------------------------------------------------------------------------------
211 // Setup Operator Inputs
212 //------------------------------------------------------------------------------
213 static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields,
214     CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
215     CeedVector in_vec, bool skip_active, CeedScalar *e_data_full[2*CEED_FIELD_MAX],
216     CeedOperator_Blocked *impl, CeedRequest *request) {
217   CeedInt ierr;
218   CeedEvalMode eval_mode;
219   CeedVector vec;
220   uint64_t state;
221 
222   for (CeedInt i=0; i<num_input_fields; i++) {
223     // Get input vector
224     ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
225     CeedChkBackend(ierr);
226     if (vec == CEED_VECTOR_ACTIVE) {
227       if (skip_active)
228         continue;
229       else
230         vec = in_vec;
231     }
232 
233     ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
234     CeedChkBackend(ierr);
235     if (eval_mode == CEED_EVAL_WEIGHT) { // Skip
236     } else {
237       // Restrict
238       ierr = CeedVectorGetState(vec, &state); CeedChkBackend(ierr);
239       if (state != impl->input_states[i] || vec == in_vec) {
240         ierr = CeedElemRestrictionApply(impl->blk_restr[i], CEED_NOTRANSPOSE,
241                                         vec, impl->e_vecs_full[i], request);
242         CeedChkBackend(ierr);
243         impl->input_states[i] = state;
244       }
245       // Get evec
246       ierr = CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST,
247                                     (const CeedScalar **) &e_data_full[i]);
248       CeedChkBackend(ierr);
249     }
250   }
251   return CEED_ERROR_SUCCESS;
252 }
253 
254 //------------------------------------------------------------------------------
255 // Input Basis Action
256 //------------------------------------------------------------------------------
257 static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q,
258     CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
259     CeedInt num_input_fields, CeedInt blk_size, bool skip_active,
260     CeedScalar *e_data_full[2*CEED_FIELD_MAX], CeedOperator_Blocked *impl) {
261   CeedInt ierr;
262   CeedInt dim, elem_size, size;
263   CeedElemRestriction elem_restr;
264   CeedEvalMode eval_mode;
265   CeedBasis basis;
266 
267   for (CeedInt i=0; i<num_input_fields; i++) {
268     // Skip active input
269     if (skip_active) {
270       CeedVector vec;
271       ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
272       CeedChkBackend(ierr);
273       if (vec == CEED_VECTOR_ACTIVE)
274         continue;
275     }
276 
277     // Get elem_size, eval_mode, size
278     ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_restr);
279     CeedChkBackend(ierr);
280     ierr = CeedElemRestrictionGetElementSize(elem_restr, &elem_size);
281     CeedChkBackend(ierr);
282     ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
283     CeedChkBackend(ierr);
284     ierr = CeedQFunctionFieldGetSize(qf_input_fields[i], &size);
285     CeedChkBackend(ierr);
286     // Basis action
287     switch(eval_mode) {
288     case CEED_EVAL_NONE:
289       ierr = CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST,
290                                 CEED_USE_POINTER, &e_data_full[i][e*Q*size]);
291       CeedChkBackend(ierr);
292       break;
293     case CEED_EVAL_INTERP:
294       ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis);
295       CeedChkBackend(ierr);
296       ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST,
297                                 CEED_USE_POINTER, &e_data_full[i][e*elem_size*size]);
298       CeedChkBackend(ierr);
299       ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE,
300                             CEED_EVAL_INTERP, impl->e_vecs_in[i],
301                             impl->q_vecs_in[i]); CeedChkBackend(ierr);
302       break;
303     case CEED_EVAL_GRAD:
304       ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis);
305       CeedChkBackend(ierr);
306       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
307       ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST,
308                                 CEED_USE_POINTER, &e_data_full[i][e*elem_size*size/dim]);
309       CeedChkBackend(ierr);
310       ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE,
311                             CEED_EVAL_GRAD, impl->e_vecs_in[i],
312                             impl->q_vecs_in[i]); CeedChkBackend(ierr);
313       break;
314     case CEED_EVAL_WEIGHT:
315       break;  // No action
316     // LCOV_EXCL_START
317     case CEED_EVAL_DIV:
318     case CEED_EVAL_CURL: {
319       ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis);
320       CeedChkBackend(ierr);
321       Ceed ceed;
322       ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
323       return CeedError(ceed, CEED_ERROR_BACKEND,
324                        "Ceed evaluation mode not implemented");
325       // LCOV_EXCL_STOP
326     }
327     }
328   }
329   return CEED_ERROR_SUCCESS;
330 }
331 
332 //------------------------------------------------------------------------------
333 // Output Basis Action
334 //------------------------------------------------------------------------------
335 static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q,
336     CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields,
337     CeedInt blk_size, CeedInt num_input_fields, CeedInt num_output_fields,
338     CeedOperator op, CeedScalar *e_data_full[2*CEED_FIELD_MAX],
339     CeedOperator_Blocked *impl) {
340   CeedInt ierr;
341   CeedInt dim, elem_size, size;
342   CeedElemRestriction elem_restr;
343   CeedEvalMode eval_mode;
344   CeedBasis basis;
345 
346   for (CeedInt i=0; i<num_output_fields; i++) {
347     // Get elem_size, eval_mode, size
348     ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_restr);
349     CeedChkBackend(ierr);
350     ierr = CeedElemRestrictionGetElementSize(elem_restr, &elem_size);
351     CeedChkBackend(ierr);
352     ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
353     CeedChkBackend(ierr);
354     ierr = CeedQFunctionFieldGetSize(qf_output_fields[i], &size);
355     CeedChkBackend(ierr);
356     // Basis action
357     switch(eval_mode) {
358     case CEED_EVAL_NONE:
359       break; // No action
360     case CEED_EVAL_INTERP:
361       ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis);
362       CeedChkBackend(ierr);
363       ierr = CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST,
364                                 CEED_USE_POINTER, &e_data_full[i + num_input_fields][e*elem_size*size]);
365       CeedChkBackend(ierr);
366       ierr = CeedBasisApply(basis, blk_size, CEED_TRANSPOSE,
367                             CEED_EVAL_INTERP, impl->q_vecs_out[i],
368                             impl->e_vecs_out[i]); CeedChkBackend(ierr);
369       break;
370     case CEED_EVAL_GRAD:
371       ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis);
372       CeedChkBackend(ierr);
373       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
374       ierr = CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST,
375                                 CEED_USE_POINTER, &e_data_full[i + num_input_fields][e*elem_size*size/dim]);
376       CeedChkBackend(ierr);
377       ierr = CeedBasisApply(basis, blk_size, CEED_TRANSPOSE,
378                             CEED_EVAL_GRAD, impl->q_vecs_out[i],
379                             impl->e_vecs_out[i]); CeedChkBackend(ierr);
380       break;
381     // LCOV_EXCL_START
382     case CEED_EVAL_WEIGHT: {
383       Ceed ceed;
384       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
385       return CeedError(ceed, CEED_ERROR_BACKEND,
386                        "CEED_EVAL_WEIGHT cannot be an output "
387                        "evaluation mode");
388     }
389     case CEED_EVAL_DIV:
390     case CEED_EVAL_CURL: {
391       Ceed ceed;
392       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
393       return CeedError(ceed, CEED_ERROR_BACKEND,
394                        "Ceed evaluation mode not implemented");
395       // LCOV_EXCL_STOP
396     }
397     }
398   }
399   return CEED_ERROR_SUCCESS;
400 }
401 
402 //------------------------------------------------------------------------------
403 // Restore Input Vectors
404 //------------------------------------------------------------------------------
405 static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields,
406     CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
407     bool skip_active, CeedScalar *e_data_full[2*CEED_FIELD_MAX],
408     CeedOperator_Blocked *impl) {
409   CeedInt ierr;
410   CeedEvalMode eval_mode;
411 
412   for (CeedInt i=0; i<num_input_fields; i++) {
413     // Skip active inputs
414     if (skip_active) {
415       CeedVector vec;
416       ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
417       CeedChkBackend(ierr);
418       if (vec == CEED_VECTOR_ACTIVE)
419         continue;
420     }
421     ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
422     CeedChkBackend(ierr);
423     if (eval_mode == CEED_EVAL_WEIGHT) { // Skip
424     } else {
425       ierr = CeedVectorRestoreArrayRead(impl->e_vecs_full[i],
426                                         (const CeedScalar **) &e_data_full[i]);
427       CeedChkBackend(ierr);
428     }
429   }
430   return CEED_ERROR_SUCCESS;
431 }
432 
433 //------------------------------------------------------------------------------
434 // Operator Apply
435 //------------------------------------------------------------------------------
436 static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec,
437                                         CeedVector out_vec,
438                                         CeedRequest *request) {
439   int ierr;
440   CeedOperator_Blocked *impl;
441   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
442   const CeedInt blk_size = 8;
443   CeedInt Q, num_input_fields, num_output_fields, num_elem, size;
444   ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr);
445   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
446   CeedInt num_blks = (num_elem/blk_size) + !!(num_elem%blk_size);
447   CeedQFunction qf;
448   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
449   CeedOperatorField *op_input_fields, *op_output_fields;
450   ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields,
451                                &num_output_fields, &op_output_fields);
452   CeedChkBackend(ierr);
453   CeedQFunctionField *qf_input_fields, *qf_output_fields;
454   ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL,
455                                 &qf_output_fields);
456   CeedChkBackend(ierr);
457   CeedEvalMode eval_mode;
458   CeedVector vec;
459   CeedScalar *e_data_full[2*CEED_FIELD_MAX] = {0};
460 
461   // Setup
462   ierr = CeedOperatorSetup_Blocked(op); CeedChkBackend(ierr);
463 
464   // Restriction only operator
465   if (impl->is_identity_restr_op) {
466     ierr = CeedElemRestrictionApply(impl->blk_restr[0], CEED_NOTRANSPOSE, in_vec,
467                                     impl->e_vecs_full[0], request); CeedChkBackend(ierr);
468     ierr = CeedElemRestrictionApply(impl->blk_restr[1], CEED_TRANSPOSE,
469                                     impl->e_vecs_full[0], out_vec, request); CeedChkBackend(ierr);
470     return CEED_ERROR_SUCCESS;
471   }
472 
473   // Input Evecs and Restriction
474   ierr = CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields,
475                                          op_input_fields, in_vec, false, e_data_full,
476                                          impl, request); CeedChkBackend(ierr);
477 
478   // Output Evecs
479   for (CeedInt i=0; i<num_output_fields; i++) {
480     ierr = CeedVectorGetArrayWrite(impl->e_vecs_full[i+impl->num_inputs],
481                                    CEED_MEM_HOST, &e_data_full[i + num_input_fields]);
482     CeedChkBackend(ierr);
483   }
484 
485   // Loop through elements
486   for (CeedInt e=0; e<num_blks*blk_size; e+=blk_size) {
487     // Output pointers
488     for (CeedInt i=0; i<num_output_fields; i++) {
489       ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
490       CeedChkBackend(ierr);
491       if (eval_mode == CEED_EVAL_NONE) {
492         ierr = CeedQFunctionFieldGetSize(qf_output_fields[i], &size);
493         CeedChkBackend(ierr);
494         ierr = CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST,
495                                   CEED_USE_POINTER, &e_data_full[i + num_input_fields][e*Q*size]);
496         CeedChkBackend(ierr);
497       }
498     }
499 
500     // Input basis apply
501     ierr = CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields,
502                                           num_input_fields, blk_size, false, e_data_full,
503                                           impl); CeedChkBackend(ierr);
504 
505     // Q function
506     if (!impl->is_identity_qf) {
507       ierr = CeedQFunctionApply(qf, Q*blk_size, impl->q_vecs_in, impl->q_vecs_out);
508       CeedChkBackend(ierr);
509     }
510 
511     // Output basis apply
512     ierr = CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields,
513                                            blk_size, num_input_fields,
514                                            num_output_fields, op, e_data_full, impl);
515     CeedChkBackend(ierr);
516   }
517 
518   // Output restriction
519   for (CeedInt i=0; i<num_output_fields; i++) {
520     // Restore evec
521     ierr = CeedVectorRestoreArray(impl->e_vecs_full[i+impl->num_inputs],
522                                   &e_data_full[i + num_input_fields]);
523     CeedChkBackend(ierr);
524     // Get output vector
525     ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec);
526     CeedChkBackend(ierr);
527     // Active
528     if (vec == CEED_VECTOR_ACTIVE)
529       vec = out_vec;
530     // Restrict
531     ierr = CeedElemRestrictionApply(impl->blk_restr[i+impl->num_inputs],
532                                     CEED_TRANSPOSE, impl->e_vecs_full[i+impl->num_inputs],
533                                     vec, request); CeedChkBackend(ierr);
534 
535   }
536 
537   // Restore input arrays
538   ierr = CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields,
539          op_input_fields, false, e_data_full, impl); CeedChkBackend(ierr);
540 
541   return CEED_ERROR_SUCCESS;
542 }
543 
544 //------------------------------------------------------------------------------
545 // Core code for assembling linear QFunction
546 //------------------------------------------------------------------------------
547 static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(
548   CeedOperator op, bool build_objects, CeedVector *assembled,
549   CeedElemRestriction *rstr, CeedRequest *request) {
550   int ierr;
551   CeedOperator_Blocked *impl;
552   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
553   const CeedInt blk_size = 8;
554   CeedInt Q, num_input_fields, num_output_fields, num_elem, size;
555   ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr);
556   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
557   CeedInt num_blks = (num_elem/blk_size) + !!(num_elem%blk_size);
558   CeedQFunction qf;
559   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
560   CeedOperatorField *op_input_fields, *op_output_fields;
561   ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields,
562                                &num_output_fields, &op_output_fields);
563   CeedChkBackend(ierr);
564   CeedQFunctionField *qf_input_fields, *qf_output_fields;
565   ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL,
566                                 &qf_output_fields);
567   CeedChkBackend(ierr);
568   CeedVector vec, l_vec = impl->qf_l_vec;
569   CeedInt num_active_in = impl->num_active_in,
570           num_active_out = impl->num_active_out;
571   CeedSize q_size;
572   CeedVector *active_in = impl->qf_active_in;
573   CeedScalar *a, *tmp;
574   Ceed ceed;
575   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
576   CeedScalar *e_data_full[2*CEED_FIELD_MAX] = {0};
577 
578   // Setup
579   ierr = CeedOperatorSetup_Blocked(op); CeedChkBackend(ierr);
580 
581   // Check for identity
582   if (impl->is_identity_qf)
583     // LCOV_EXCL_START
584     return CeedError(ceed, CEED_ERROR_BACKEND,
585                      "Assembling identity QFunctions not supported");
586   // LCOV_EXCL_STOP
587 
588   // Input Evecs and Restriction
589   ierr = CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields,
590                                          op_input_fields, NULL, true, e_data_full,
591                                          impl, request); CeedChkBackend(ierr);
592 
593   // Count number of active input fields
594   if (!num_active_in) {
595     for (CeedInt i=0; i<num_input_fields; i++) {
596       // Get input vector
597       ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
598       CeedChkBackend(ierr);
599       // Check if active input
600       if (vec == CEED_VECTOR_ACTIVE) {
601         ierr = CeedQFunctionFieldGetSize(qf_input_fields[i], &size);
602         CeedChkBackend(ierr);
603         ierr = CeedVectorSetValue(impl->q_vecs_in[i], 0.0); CeedChkBackend(ierr);
604         ierr = CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp);
605         CeedChkBackend(ierr);
606         ierr = CeedRealloc(num_active_in + size, &active_in); CeedChkBackend(ierr);
607         for (CeedInt field=0; field<size; field++) {
608           q_size = (CeedSize)Q*blk_size;
609           ierr = CeedVectorCreate(ceed, q_size, &active_in[num_active_in+field]);
610           CeedChkBackend(ierr);
611           ierr = CeedVectorSetArray(active_in[num_active_in+field], CEED_MEM_HOST,
612                                     CEED_USE_POINTER, &tmp[field*Q*blk_size]);
613           CeedChkBackend(ierr);
614         }
615         num_active_in += size;
616         ierr = CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp); CeedChkBackend(ierr);
617       }
618     }
619     impl->num_active_in = num_active_in;
620     impl->qf_active_in = active_in;
621   }
622 
623   // Count number of active output fields
624   if (!num_active_out) {
625     for (CeedInt i=0; i<num_output_fields; i++) {
626       // Get output vector
627       ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec);
628       CeedChkBackend(ierr);
629       // Check if active output
630       if (vec == CEED_VECTOR_ACTIVE) {
631         ierr = CeedQFunctionFieldGetSize(qf_output_fields[i], &size);
632         CeedChkBackend(ierr);
633         num_active_out += size;
634       }
635     }
636     impl->num_active_out = num_active_out;
637   }
638 
639   // Check sizes
640   if (!num_active_in || !num_active_out)
641     // LCOV_EXCL_START
642     return CeedError(ceed, CEED_ERROR_BACKEND,
643                      "Cannot assemble QFunction without active inputs "
644                      "and outputs");
645   // LCOV_EXCL_STOP
646 
647   // Setup Lvec
648   if (!l_vec) {
649     CeedSize l_size = (CeedSize)num_blks*blk_size*Q*num_active_in*num_active_out;
650     ierr = CeedVectorCreate(ceed, l_size, &l_vec); CeedChkBackend(ierr);
651     impl->qf_l_vec = l_vec;
652   }
653   ierr = CeedVectorGetArrayWrite(l_vec, CEED_MEM_HOST, &a); CeedChkBackend(ierr);
654 
655   // Build objects if needed
656   CeedInt strides[3] = {1, Q, num_active_in *num_active_out*Q};
657   if (build_objects) {
658     // Create output restriction
659     ierr = CeedElemRestrictionCreateStrided(ceed, num_elem, Q,
660                                             num_active_in*num_active_out,
661                                             num_active_in*num_active_out*num_elem*Q,
662                                             strides, rstr); CeedChkBackend(ierr);
663     // Create assembled vector
664     CeedSize l_size = (CeedSize)num_elem*Q*num_active_in*num_active_out;
665     ierr = CeedVectorCreate(ceed, l_size, assembled); CeedChkBackend(ierr);
666   }
667 
668   // Loop through elements
669   for (CeedInt e=0; e<num_blks*blk_size; e+=blk_size) {
670     // Input basis apply
671     ierr = CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields,
672                                           num_input_fields, blk_size, true, e_data_full,
673                                           impl); CeedChkBackend(ierr);
674 
675     // Assemble QFunction
676     for (CeedInt in=0; in<num_active_in; in++) {
677       // Set Inputs
678       ierr = CeedVectorSetValue(active_in[in], 1.0); CeedChkBackend(ierr);
679       if (num_active_in > 1) {
680         ierr = CeedVectorSetValue(active_in[(in+num_active_in-1)%num_active_in],
681                                   0.0); CeedChkBackend(ierr);
682       }
683       // Set Outputs
684       for (CeedInt out=0; out<num_output_fields; out++) {
685         // Get output vector
686         ierr = CeedOperatorFieldGetVector(op_output_fields[out], &vec);
687         CeedChkBackend(ierr);
688         // Check if active output
689         if (vec == CEED_VECTOR_ACTIVE) {
690           CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST,
691                              CEED_USE_POINTER, a); CeedChkBackend(ierr);
692           ierr = CeedQFunctionFieldGetSize(qf_output_fields[out], &size);
693           CeedChkBackend(ierr);
694           a += size*Q*blk_size; // Advance the pointer by the size of the output
695         }
696       }
697       // Apply QFunction
698       ierr = CeedQFunctionApply(qf, Q*blk_size, impl->q_vecs_in, impl->q_vecs_out);
699       CeedChkBackend(ierr);
700     }
701   }
702 
703   // Un-set output Qvecs to prevent accidental overwrite of Assembled
704   for (CeedInt out=0; out<num_output_fields; out++) {
705     // Get output vector
706     ierr = CeedOperatorFieldGetVector(op_output_fields[out], &vec);
707     CeedChkBackend(ierr);
708     // Check if active output
709     if (vec == CEED_VECTOR_ACTIVE) {
710       CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_COPY_VALUES,
711                          NULL); CeedChkBackend(ierr);
712     }
713   }
714 
715   // Restore input arrays
716   ierr = CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields,
717          op_input_fields, true, e_data_full, impl); CeedChkBackend(ierr);
718 
719   // Output blocked restriction
720   ierr = CeedVectorRestoreArray(l_vec, &a); CeedChkBackend(ierr);
721   ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr);
722   CeedElemRestriction blk_rstr = impl->qf_blk_rstr;
723   if (!impl->qf_blk_rstr) {
724     ierr = CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, blk_size,
725            num_active_in*num_active_out, num_active_in*num_active_out*num_elem*Q,
726            strides, &blk_rstr); CeedChkBackend(ierr);
727     impl->qf_blk_rstr = blk_rstr;
728   }
729   ierr = CeedElemRestrictionApply(blk_rstr, CEED_TRANSPOSE, l_vec, *assembled,
730                                   request); CeedChkBackend(ierr);
731 
732   return CEED_ERROR_SUCCESS;
733 }
734 
735 //------------------------------------------------------------------------------
736 // Assemble Linear QFunction
737 //------------------------------------------------------------------------------
738 static int CeedOperatorLinearAssembleQFunction_Blocked(CeedOperator op,
739     CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
740   return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, true, assembled,
741          rstr, request);
742 }
743 
744 //------------------------------------------------------------------------------
745 // Update Assembled Linear QFunction
746 //------------------------------------------------------------------------------
747 static int CeedOperatorLinearAssembleQFunctionUpdate_Blocked(CeedOperator op,
748     CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
749   return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, false, &assembled,
750          &rstr, request);
751 }
752 
753 //------------------------------------------------------------------------------
754 // Operator Destroy
755 //------------------------------------------------------------------------------
756 static int CeedOperatorDestroy_Blocked(CeedOperator op) {
757   int ierr;
758   CeedOperator_Blocked *impl;
759   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
760 
761   for (CeedInt i=0; i<impl->num_inputs+impl->num_outputs; i++) {
762     ierr = CeedElemRestrictionDestroy(&impl->blk_restr[i]); CeedChkBackend(ierr);
763     ierr = CeedVectorDestroy(&impl->e_vecs_full[i]); CeedChkBackend(ierr);
764   }
765   ierr = CeedFree(&impl->blk_restr); CeedChkBackend(ierr);
766   ierr = CeedFree(&impl->e_vecs_full); CeedChkBackend(ierr);
767   ierr = CeedFree(&impl->input_states); CeedChkBackend(ierr);
768 
769   for (CeedInt i=0; i<impl->num_inputs; i++) {
770     ierr = CeedVectorDestroy(&impl->e_vecs_in[i]); CeedChkBackend(ierr);
771     ierr = CeedVectorDestroy(&impl->q_vecs_in[i]); CeedChkBackend(ierr);
772   }
773   ierr = CeedFree(&impl->e_vecs_in); CeedChkBackend(ierr);
774   ierr = CeedFree(&impl->q_vecs_in); CeedChkBackend(ierr);
775 
776   for (CeedInt i=0; i<impl->num_outputs; i++) {
777     ierr = CeedVectorDestroy(&impl->e_vecs_out[i]); CeedChkBackend(ierr);
778     ierr = CeedVectorDestroy(&impl->q_vecs_out[i]); CeedChkBackend(ierr);
779   }
780   ierr = CeedFree(&impl->e_vecs_out); CeedChkBackend(ierr);
781   ierr = CeedFree(&impl->q_vecs_out); CeedChkBackend(ierr);
782 
783   // QFunction assembly data
784   for (CeedInt i=0; i<impl->num_active_in; i++) {
785     ierr = CeedVectorDestroy(&impl->qf_active_in[i]); CeedChkBackend(ierr);
786   }
787   ierr = CeedFree(&impl->qf_active_in); CeedChkBackend(ierr);
788   ierr = CeedVectorDestroy(&impl->qf_l_vec); CeedChkBackend(ierr);
789   ierr = CeedElemRestrictionDestroy(&impl->qf_blk_rstr); CeedChkBackend(ierr);
790 
791   ierr = CeedFree(&impl); CeedChkBackend(ierr);
792   return CEED_ERROR_SUCCESS;
793 }
794 
795 //------------------------------------------------------------------------------
796 // Operator Create
797 //------------------------------------------------------------------------------
798 int CeedOperatorCreate_Blocked(CeedOperator op) {
799   int ierr;
800   Ceed ceed;
801   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
802   CeedOperator_Blocked *impl;
803 
804   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
805   ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr);
806 
807   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction",
808                                 CeedOperatorLinearAssembleQFunction_Blocked);
809   CeedChkBackend(ierr);
810   ierr = CeedSetBackendFunction(ceed, "Operator", op,
811                                 "LinearAssembleQFunctionUpdate",
812                                 CeedOperatorLinearAssembleQFunctionUpdate_Blocked);
813   CeedChkBackend(ierr);
814   ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd",
815                                 CeedOperatorApplyAdd_Blocked); CeedChkBackend(ierr);
816   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
817                                 CeedOperatorDestroy_Blocked); CeedChkBackend(ierr);
818   return CEED_ERROR_SUCCESS;
819 }
820 //------------------------------------------------------------------------------
821