xref: /libCEED/backends/blocked/ceed-blocked-operator.c (revision eab5b1a2bf6b5384761bd0b9e014e873aafcca6d)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed/ceed.h>
18 #include <ceed/backend.h>
19 #include <stdbool.h>
20 #include <stddef.h>
21 #include <stdint.h>
22 #include "ceed-blocked.h"
23 
24 //------------------------------------------------------------------------------
25 // Setup Input/Output Fields
26 //------------------------------------------------------------------------------
27 static int CeedOperatorSetupFields_Blocked(CeedQFunction qf,
28     CeedOperator op, bool is_input, CeedElemRestriction *blk_restr,
29     CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs,
30     CeedInt start_e, CeedInt num_fields, CeedInt Q) {
31   CeedInt dim, ierr, num_comp, size, P;
32   Ceed ceed;
33   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
34   CeedBasis basis;
35   CeedElemRestriction r;
36   CeedOperatorField *op_fields;
37   CeedQFunctionField *qf_fields;
38   if (is_input) {
39     ierr = CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL);
40     CeedChkBackend(ierr);
41     ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL);
42     CeedChkBackend(ierr);
43   } else {
44     ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields);
45     CeedChkBackend(ierr);
46     ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields);
47     CeedChkBackend(ierr);
48   }
49   const CeedInt blk_size = 8;
50 
51   // Loop over fields
52   for (CeedInt i=0; i<num_fields; i++) {
53     CeedEvalMode eval_mode;
54     ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode);
55     CeedChkBackend(ierr);
56 
57     if (eval_mode != CEED_EVAL_WEIGHT) {
58       ierr = CeedOperatorFieldGetElemRestriction(op_fields[i], &r);
59       CeedChkBackend(ierr);
60       ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
61       CeedInt num_elem, elem_size, l_size, comp_stride;
62       ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr);
63       ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
64       ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr);
65       ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr);
66 
67       bool strided;
68       ierr = CeedElemRestrictionIsStrided(r, &strided); CeedChkBackend(ierr);
69       if (strided) {
70         CeedInt strides[3];
71         ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr);
72         ierr = CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, elem_size,
73                blk_size, num_comp, l_size, strides, &blk_restr[i+start_e]);
74         CeedChkBackend(ierr);
75       } else {
76         const CeedInt *offsets = NULL;
77         ierr = CeedElemRestrictionGetOffsets(r, CEED_MEM_HOST, &offsets);
78         CeedChkBackend(ierr);
79         ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr);
80         ierr = CeedElemRestrictionCreateBlocked(ceed, num_elem, elem_size,
81                                                 blk_size, num_comp, comp_stride,
82                                                 l_size, CEED_MEM_HOST,
83                                                 CEED_COPY_VALUES, offsets,
84                                                 &blk_restr[i+start_e]);
85         CeedChkBackend(ierr);
86         ierr = CeedElemRestrictionRestoreOffsets(r, &offsets); CeedChkBackend(ierr);
87       }
88       ierr = CeedElemRestrictionCreateVector(blk_restr[i+start_e], NULL,
89                                              &e_vecs_full[i+start_e]);
90       CeedChkBackend(ierr);
91     }
92 
93     switch(eval_mode) {
94     case CEED_EVAL_NONE:
95       ierr = CeedQFunctionFieldGetSize(qf_fields[i], &size); CeedChkBackend(ierr);
96       ierr = CeedVectorCreate(ceed, Q*size*blk_size, &q_vecs[i]);
97       CeedChkBackend(ierr);
98       break;
99     case CEED_EVAL_INTERP:
100       ierr = CeedQFunctionFieldGetSize(qf_fields[i], &size); CeedChkBackend(ierr);
101       ierr = CeedElemRestrictionGetElementSize(r, &P);
102       CeedChkBackend(ierr);
103       ierr = CeedVectorCreate(ceed, P*size*blk_size, &e_vecs[i]);
104       CeedChkBackend(ierr);
105       ierr = CeedVectorCreate(ceed, Q*size*blk_size, &q_vecs[i]);
106       CeedChkBackend(ierr);
107       break;
108     case CEED_EVAL_GRAD:
109       ierr = CeedOperatorFieldGetBasis(op_fields[i], &basis); CeedChkBackend(ierr);
110       ierr = CeedQFunctionFieldGetSize(qf_fields[i], &size); CeedChkBackend(ierr);
111       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
112       ierr = CeedElemRestrictionGetElementSize(r, &P);
113       CeedChkBackend(ierr);
114       ierr = CeedVectorCreate(ceed, P*size/dim*blk_size, &e_vecs[i]);
115       CeedChkBackend(ierr);
116       ierr = CeedVectorCreate(ceed, Q*size*blk_size, &q_vecs[i]);
117       CeedChkBackend(ierr);
118       break;
119     case CEED_EVAL_WEIGHT: // Only on input fields
120       ierr = CeedOperatorFieldGetBasis(op_fields[i], &basis); CeedChkBackend(ierr);
121       ierr = CeedVectorCreate(ceed, Q*blk_size, &q_vecs[i]); CeedChkBackend(ierr);
122       ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE,
123                             CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]);
124       CeedChkBackend(ierr);
125 
126       break;
127     case CEED_EVAL_DIV:
128       break; // Not implemented
129     case CEED_EVAL_CURL:
130       break; // Not implemented
131     }
132   }
133   return CEED_ERROR_SUCCESS;
134 }
135 
136 //------------------------------------------------------------------------------
137 // Setup Operator
138 //------------------------------------------------------------------------------
139 static int CeedOperatorSetup_Blocked(CeedOperator op) {
140   int ierr;
141   bool is_setup_done;
142   ierr = CeedOperatorIsSetupDone(op, &is_setup_done); CeedChkBackend(ierr);
143   if (is_setup_done) return CEED_ERROR_SUCCESS;
144   Ceed ceed;
145   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
146   CeedOperator_Blocked *impl;
147   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
148   CeedQFunction qf;
149   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
150   CeedInt Q, num_input_fields, num_output_fields;
151   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
152   ierr = CeedQFunctionIsIdentity(qf, &impl->is_identity_qf); CeedChkBackend(ierr);
153   CeedOperatorField *op_input_fields, *op_output_fields;
154   ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields,
155                                &num_output_fields, &op_output_fields);
156   CeedChkBackend(ierr);
157   CeedQFunctionField *qf_input_fields, *qf_output_fields;
158   ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL,
159                                 &qf_output_fields);
160   CeedChkBackend(ierr);
161 
162   // Allocate
163   ierr = CeedCalloc(num_input_fields + num_output_fields, &impl->blk_restr);
164   CeedChkBackend(ierr);
165   ierr = CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full);
166   CeedChkBackend(ierr);
167 
168   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->input_states); CeedChkBackend(ierr);
169   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in); CeedChkBackend(ierr);
170   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out); CeedChkBackend(ierr);
171   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in); CeedChkBackend(ierr);
172   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out); CeedChkBackend(ierr);
173 
174   impl->num_inputs = num_input_fields;
175   impl->num_outputs = num_output_fields;
176 
177   // Set up infield and outfield pointer arrays
178   // Infields
179   ierr = CeedOperatorSetupFields_Blocked(qf, op, true, impl->blk_restr,
180                                          impl->e_vecs_full, impl->e_vecs_in,
181                                          impl->q_vecs_in, 0,
182                                          num_input_fields, Q);
183   CeedChkBackend(ierr);
184   // Outfields
185   ierr = CeedOperatorSetupFields_Blocked(qf, op, false, impl->blk_restr,
186                                          impl->e_vecs_full, impl->e_vecs_out,
187                                          impl->q_vecs_out, num_input_fields,
188                                          num_output_fields, Q);
189   CeedChkBackend(ierr);
190 
191   // Identity QFunctions
192   if (impl->is_identity_qf) {
193     CeedEvalMode in_mode, out_mode;
194     CeedQFunctionField *in_fields, *out_fields;
195     ierr = CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields);
196     CeedChkBackend(ierr);
197     ierr = CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode);
198     CeedChkBackend(ierr);
199     ierr = CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode);
200     CeedChkBackend(ierr);
201 
202     if (in_mode == CEED_EVAL_NONE && out_mode == CEED_EVAL_NONE) {
203       impl->is_identity_restr_op = true;
204     } else {
205       ierr = CeedVectorDestroy(&impl->q_vecs_out[0]); CeedChkBackend(ierr);
206       impl->q_vecs_out[0] = impl->q_vecs_in[0];
207       ierr = CeedVectorAddReference(impl->q_vecs_in[0]); CeedChkBackend(ierr);
208     }
209   }
210 
211   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
212 
213   return CEED_ERROR_SUCCESS;
214 }
215 
216 //------------------------------------------------------------------------------
217 // Setup Operator Inputs
218 //------------------------------------------------------------------------------
219 static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields,
220     CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
221     CeedVector in_vec, bool skip_active, CeedScalar *e_data_full[2*CEED_FIELD_MAX],
222     CeedOperator_Blocked *impl, CeedRequest *request) {
223   CeedInt ierr;
224   CeedEvalMode eval_mode;
225   CeedVector vec;
226   uint64_t state;
227 
228   for (CeedInt i=0; i<num_input_fields; i++) {
229     // Get input vector
230     ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
231     CeedChkBackend(ierr);
232     if (vec == CEED_VECTOR_ACTIVE) {
233       if (skip_active)
234         continue;
235       else
236         vec = in_vec;
237     }
238 
239     ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
240     CeedChkBackend(ierr);
241     if (eval_mode == CEED_EVAL_WEIGHT) { // Skip
242     } else {
243       // Restrict
244       ierr = CeedVectorGetState(vec, &state); CeedChkBackend(ierr);
245       if (state != impl->input_states[i] || vec == in_vec) {
246         ierr = CeedElemRestrictionApply(impl->blk_restr[i], CEED_NOTRANSPOSE,
247                                         vec, impl->e_vecs_full[i], request);
248         CeedChkBackend(ierr);
249         impl->input_states[i] = state;
250       }
251       // Get evec
252       ierr = CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST,
253                                     (const CeedScalar **) &e_data_full[i]);
254       CeedChkBackend(ierr);
255     }
256   }
257   return CEED_ERROR_SUCCESS;
258 }
259 
260 //------------------------------------------------------------------------------
261 // Input Basis Action
262 //------------------------------------------------------------------------------
263 static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q,
264     CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
265     CeedInt num_input_fields, CeedInt blk_size, bool skip_active,
266     CeedScalar *e_data_full[2*CEED_FIELD_MAX], CeedOperator_Blocked *impl) {
267   CeedInt ierr;
268   CeedInt dim, elem_size, size;
269   CeedElemRestriction elem_restr;
270   CeedEvalMode eval_mode;
271   CeedBasis basis;
272 
273   for (CeedInt i=0; i<num_input_fields; i++) {
274     // Skip active input
275     if (skip_active) {
276       CeedVector vec;
277       ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
278       CeedChkBackend(ierr);
279       if (vec == CEED_VECTOR_ACTIVE)
280         continue;
281     }
282 
283     // Get elem_size, eval_mode, size
284     ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_restr);
285     CeedChkBackend(ierr);
286     ierr = CeedElemRestrictionGetElementSize(elem_restr, &elem_size);
287     CeedChkBackend(ierr);
288     ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
289     CeedChkBackend(ierr);
290     ierr = CeedQFunctionFieldGetSize(qf_input_fields[i], &size);
291     CeedChkBackend(ierr);
292     // Basis action
293     switch(eval_mode) {
294     case CEED_EVAL_NONE:
295       ierr = CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST,
296                                 CEED_USE_POINTER, &e_data_full[i][e*Q*size]);
297       CeedChkBackend(ierr);
298       break;
299     case CEED_EVAL_INTERP:
300       ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis);
301       CeedChkBackend(ierr);
302       ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST,
303                                 CEED_USE_POINTER, &e_data_full[i][e*elem_size*size]);
304       CeedChkBackend(ierr);
305       ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE,
306                             CEED_EVAL_INTERP, impl->e_vecs_in[i],
307                             impl->q_vecs_in[i]); CeedChkBackend(ierr);
308       break;
309     case CEED_EVAL_GRAD:
310       ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis);
311       CeedChkBackend(ierr);
312       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
313       ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST,
314                                 CEED_USE_POINTER, &e_data_full[i][e*elem_size*size/dim]);
315       CeedChkBackend(ierr);
316       ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE,
317                             CEED_EVAL_GRAD, impl->e_vecs_in[i],
318                             impl->q_vecs_in[i]); CeedChkBackend(ierr);
319       break;
320     case CEED_EVAL_WEIGHT:
321       break;  // No action
322     // LCOV_EXCL_START
323     case CEED_EVAL_DIV:
324     case CEED_EVAL_CURL: {
325       ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis);
326       CeedChkBackend(ierr);
327       Ceed ceed;
328       ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
329       return CeedError(ceed, CEED_ERROR_BACKEND,
330                        "Ceed evaluation mode not implemented");
331       // LCOV_EXCL_STOP
332     }
333     }
334   }
335   return CEED_ERROR_SUCCESS;
336 }
337 
338 //------------------------------------------------------------------------------
339 // Output Basis Action
340 //------------------------------------------------------------------------------
341 static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q,
342     CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields,
343     CeedInt blk_size, CeedInt num_input_fields, CeedInt num_output_fields,
344     CeedOperator op, CeedScalar *e_data_full[2*CEED_FIELD_MAX],
345     CeedOperator_Blocked *impl) {
346   CeedInt ierr;
347   CeedInt dim, elem_size, size;
348   CeedElemRestriction elem_restr;
349   CeedEvalMode eval_mode;
350   CeedBasis basis;
351 
352   for (CeedInt i=0; i<num_output_fields; i++) {
353     // Get elem_size, eval_mode, size
354     ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_restr);
355     CeedChkBackend(ierr);
356     ierr = CeedElemRestrictionGetElementSize(elem_restr, &elem_size);
357     CeedChkBackend(ierr);
358     ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
359     CeedChkBackend(ierr);
360     ierr = CeedQFunctionFieldGetSize(qf_output_fields[i], &size);
361     CeedChkBackend(ierr);
362     // Basis action
363     switch(eval_mode) {
364     case CEED_EVAL_NONE:
365       break; // No action
366     case CEED_EVAL_INTERP:
367       ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis);
368       CeedChkBackend(ierr);
369       ierr = CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST,
370                                 CEED_USE_POINTER, &e_data_full[i + num_input_fields][e*elem_size*size]);
371       CeedChkBackend(ierr);
372       ierr = CeedBasisApply(basis, blk_size, CEED_TRANSPOSE,
373                             CEED_EVAL_INTERP, impl->q_vecs_out[i],
374                             impl->e_vecs_out[i]); CeedChkBackend(ierr);
375       break;
376     case CEED_EVAL_GRAD:
377       ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis);
378       CeedChkBackend(ierr);
379       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
380       ierr = CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST,
381                                 CEED_USE_POINTER, &e_data_full[i + num_input_fields][e*elem_size*size/dim]);
382       CeedChkBackend(ierr);
383       ierr = CeedBasisApply(basis, blk_size, CEED_TRANSPOSE,
384                             CEED_EVAL_GRAD, impl->q_vecs_out[i],
385                             impl->e_vecs_out[i]); CeedChkBackend(ierr);
386       break;
387     // LCOV_EXCL_START
388     case CEED_EVAL_WEIGHT: {
389       Ceed ceed;
390       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
391       return CeedError(ceed, CEED_ERROR_BACKEND,
392                        "CEED_EVAL_WEIGHT cannot be an output "
393                        "evaluation mode");
394     }
395     case CEED_EVAL_DIV:
396     case CEED_EVAL_CURL: {
397       Ceed ceed;
398       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
399       return CeedError(ceed, CEED_ERROR_BACKEND,
400                        "Ceed evaluation mode not implemented");
401       // LCOV_EXCL_STOP
402     }
403     }
404   }
405   return CEED_ERROR_SUCCESS;
406 }
407 
408 //------------------------------------------------------------------------------
409 // Restore Input Vectors
410 //------------------------------------------------------------------------------
411 static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields,
412     CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
413     bool skip_active, CeedScalar *e_data_full[2*CEED_FIELD_MAX],
414     CeedOperator_Blocked *impl) {
415   CeedInt ierr;
416   CeedEvalMode eval_mode;
417 
418   for (CeedInt i=0; i<num_input_fields; i++) {
419     // Skip active inputs
420     if (skip_active) {
421       CeedVector vec;
422       ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
423       CeedChkBackend(ierr);
424       if (vec == CEED_VECTOR_ACTIVE)
425         continue;
426     }
427     ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
428     CeedChkBackend(ierr);
429     if (eval_mode == CEED_EVAL_WEIGHT) { // Skip
430     } else {
431       ierr = CeedVectorRestoreArrayRead(impl->e_vecs_full[i],
432                                         (const CeedScalar **) &e_data_full[i]);
433       CeedChkBackend(ierr);
434     }
435   }
436   return CEED_ERROR_SUCCESS;
437 }
438 
439 //------------------------------------------------------------------------------
440 // Operator Apply
441 //------------------------------------------------------------------------------
442 static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec,
443                                         CeedVector out_vec,
444                                         CeedRequest *request) {
445   int ierr;
446   CeedOperator_Blocked *impl;
447   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
448   const CeedInt blk_size = 8;
449   CeedInt Q, num_input_fields, num_output_fields, num_elem, size;
450   ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr);
451   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
452   CeedInt num_blks = (num_elem/blk_size) + !!(num_elem%blk_size);
453   CeedQFunction qf;
454   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
455   CeedOperatorField *op_input_fields, *op_output_fields;
456   ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields,
457                                &num_output_fields, &op_output_fields);
458   CeedChkBackend(ierr);
459   CeedQFunctionField *qf_input_fields, *qf_output_fields;
460   ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL,
461                                 &qf_output_fields);
462   CeedChkBackend(ierr);
463   CeedEvalMode eval_mode;
464   CeedVector vec;
465   CeedScalar *e_data_full[2*CEED_FIELD_MAX] = {0};
466 
467   // Setup
468   ierr = CeedOperatorSetup_Blocked(op); CeedChkBackend(ierr);
469 
470   // Restriction only operator
471   if (impl->is_identity_restr_op) {
472     ierr = CeedElemRestrictionApply(impl->blk_restr[0], CEED_NOTRANSPOSE, in_vec,
473                                     impl->e_vecs_full[0], request); CeedChkBackend(ierr);
474     ierr = CeedElemRestrictionApply(impl->blk_restr[1], CEED_TRANSPOSE,
475                                     impl->e_vecs_full[0], out_vec, request); CeedChkBackend(ierr);
476     return CEED_ERROR_SUCCESS;
477   }
478 
479   // Input Evecs and Restriction
480   ierr = CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields,
481                                          op_input_fields, in_vec, false, e_data_full,
482                                          impl, request); CeedChkBackend(ierr);
483 
484   // Output Evecs
485   for (CeedInt i=0; i<num_output_fields; i++) {
486     ierr = CeedVectorGetArray(impl->e_vecs_full[i+impl->num_inputs], CEED_MEM_HOST,
487                               &e_data_full[i + num_input_fields]); CeedChkBackend(ierr);
488   }
489 
490   // Loop through elements
491   for (CeedInt e=0; e<num_blks*blk_size; e+=blk_size) {
492     // Output pointers
493     for (CeedInt i=0; i<num_output_fields; i++) {
494       ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
495       CeedChkBackend(ierr);
496       if (eval_mode == CEED_EVAL_NONE) {
497         ierr = CeedQFunctionFieldGetSize(qf_output_fields[i], &size);
498         CeedChkBackend(ierr);
499         ierr = CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST,
500                                   CEED_USE_POINTER, &e_data_full[i + num_input_fields][e*Q*size]);
501         CeedChkBackend(ierr);
502       }
503     }
504 
505     // Input basis apply
506     ierr = CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields,
507                                           num_input_fields, blk_size, false, e_data_full,
508                                           impl); CeedChkBackend(ierr);
509 
510     // Q function
511     if (!impl->is_identity_qf) {
512       ierr = CeedQFunctionApply(qf, Q*blk_size, impl->q_vecs_in, impl->q_vecs_out);
513       CeedChkBackend(ierr);
514     }
515 
516     // Output basis apply
517     ierr = CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields,
518                                            blk_size, num_input_fields,
519                                            num_output_fields, op, e_data_full, impl);
520     CeedChkBackend(ierr);
521   }
522 
523   // Output restriction
524   for (CeedInt i=0; i<num_output_fields; i++) {
525     // Restore evec
526     ierr = CeedVectorRestoreArray(impl->e_vecs_full[i+impl->num_inputs],
527                                   &e_data_full[i + num_input_fields]); CeedChkBackend(ierr);
528     // Get output vector
529     ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec);
530     CeedChkBackend(ierr);
531     // Active
532     if (vec == CEED_VECTOR_ACTIVE)
533       vec = out_vec;
534     // Restrict
535     ierr = CeedElemRestrictionApply(impl->blk_restr[i+impl->num_inputs],
536                                     CEED_TRANSPOSE, impl->e_vecs_full[i+impl->num_inputs],
537                                     vec, request); CeedChkBackend(ierr);
538 
539   }
540 
541   // Restore input arrays
542   ierr = CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields,
543          op_input_fields, false, e_data_full, impl); CeedChkBackend(ierr);
544 
545   return CEED_ERROR_SUCCESS;
546 }
547 
548 //------------------------------------------------------------------------------
549 // Core code for assembling linear QFunction
550 //------------------------------------------------------------------------------
551 static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(
552   CeedOperator op, bool build_objects, CeedVector *assembled,
553   CeedElemRestriction *rstr, CeedRequest *request) {
554   int ierr;
555   CeedOperator_Blocked *impl;
556   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
557   const CeedInt blk_size = 8;
558   CeedInt Q, num_input_fields, num_output_fields, num_elem, size;
559   ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr);
560   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
561   CeedInt num_blks = (num_elem/blk_size) + !!(num_elem%blk_size);
562   CeedQFunction qf;
563   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
564   CeedOperatorField *op_input_fields, *op_output_fields;
565   ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields,
566                                &num_output_fields, &op_output_fields);
567   CeedChkBackend(ierr);
568   CeedQFunctionField *qf_input_fields, *qf_output_fields;
569   ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL,
570                                 &qf_output_fields);
571   CeedChkBackend(ierr);
572   CeedVector vec, l_vec = impl->qf_l_vec;
573   CeedInt num_active_in = impl->num_active_in,
574           num_active_out = impl->num_active_out;
575   CeedVector *active_in = impl->qf_active_in;
576   CeedScalar *a, *tmp;
577   Ceed ceed;
578   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
579   CeedScalar *e_data_full[2*CEED_FIELD_MAX] = {0};
580 
581   // Setup
582   ierr = CeedOperatorSetup_Blocked(op); CeedChkBackend(ierr);
583 
584   // Check for identity
585   if (impl->is_identity_qf)
586     // LCOV_EXCL_START
587     return CeedError(ceed, CEED_ERROR_BACKEND,
588                      "Assembling identity QFunctions not supported");
589   // LCOV_EXCL_STOP
590 
591   // Input Evecs and Restriction
592   ierr = CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields,
593                                          op_input_fields, NULL, true, e_data_full,
594                                          impl, request); CeedChkBackend(ierr);
595 
596   // Count number of active input fields
597   if (!num_active_in) {
598     for (CeedInt i=0; i<num_input_fields; i++) {
599       // Get input vector
600       ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
601       CeedChkBackend(ierr);
602       // Check if active input
603       if (vec == CEED_VECTOR_ACTIVE) {
604         ierr = CeedQFunctionFieldGetSize(qf_input_fields[i], &size);
605         CeedChkBackend(ierr);
606         ierr = CeedVectorSetValue(impl->q_vecs_in[i], 0.0); CeedChkBackend(ierr);
607         ierr = CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp);
608         CeedChkBackend(ierr);
609         ierr = CeedRealloc(num_active_in + size, &active_in); CeedChkBackend(ierr);
610         for (CeedInt field=0; field<size; field++) {
611           ierr = CeedVectorCreate(ceed, Q*blk_size, &active_in[num_active_in+field]);
612           CeedChkBackend(ierr);
613           ierr = CeedVectorSetArray(active_in[num_active_in+field], CEED_MEM_HOST,
614                                     CEED_USE_POINTER, &tmp[field*Q*blk_size]);
615           CeedChkBackend(ierr);
616         }
617         num_active_in += size;
618         ierr = CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp); CeedChkBackend(ierr);
619       }
620     }
621     impl->num_active_in = num_active_in;
622     impl->qf_active_in = active_in;
623   }
624 
625   // Count number of active output fields
626   if (!num_active_out) {
627     for (CeedInt i=0; i<num_output_fields; i++) {
628       // Get output vector
629       ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec);
630       CeedChkBackend(ierr);
631       // Check if active output
632       if (vec == CEED_VECTOR_ACTIVE) {
633         ierr = CeedQFunctionFieldGetSize(qf_output_fields[i], &size);
634         CeedChkBackend(ierr);
635         num_active_out += size;
636       }
637     }
638     impl->num_active_out = num_active_out;
639   }
640 
641   // Check sizes
642   if (!num_active_in || !num_active_out)
643     // LCOV_EXCL_START
644     return CeedError(ceed, CEED_ERROR_BACKEND,
645                      "Cannot assemble QFunction without active inputs "
646                      "and outputs");
647   // LCOV_EXCL_STOP
648 
649   // Setup Lvec
650   if (!l_vec) {
651     ierr = CeedVectorCreate(ceed, num_blks*blk_size*Q*num_active_in*num_active_out,
652                             &l_vec); CeedChkBackend(ierr);
653     impl->qf_l_vec = l_vec;
654   }
655   ierr = CeedVectorGetArray(l_vec, CEED_MEM_HOST, &a); CeedChkBackend(ierr);
656 
657   // Build objects if needed
658   CeedInt strides[3] = {1, Q, num_active_in *num_active_out*Q};
659   if (build_objects) {
660     // Create output restriction
661     ierr = CeedElemRestrictionCreateStrided(ceed, num_elem, Q,
662                                             num_active_in*num_active_out,
663                                             num_active_in*num_active_out*num_elem*Q,
664                                             strides, rstr); CeedChkBackend(ierr);
665     // Create assembled vector
666     ierr = CeedVectorCreate(ceed, num_elem*Q*num_active_in*num_active_out,
667                             assembled); CeedChkBackend(ierr);
668   }
669 
670   // Loop through elements
671   for (CeedInt e=0; e<num_blks*blk_size; e+=blk_size) {
672     // Input basis apply
673     ierr = CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields,
674                                           num_input_fields, blk_size, true, e_data_full,
675                                           impl); CeedChkBackend(ierr);
676 
677     // Assemble QFunction
678     for (CeedInt in=0; in<num_active_in; in++) {
679       // Set Inputs
680       ierr = CeedVectorSetValue(active_in[in], 1.0); CeedChkBackend(ierr);
681       if (num_active_in > 1) {
682         ierr = CeedVectorSetValue(active_in[(in+num_active_in-1)%num_active_in],
683                                   0.0); CeedChkBackend(ierr);
684       }
685       // Set Outputs
686       for (CeedInt out=0; out<num_output_fields; out++) {
687         // Get output vector
688         ierr = CeedOperatorFieldGetVector(op_output_fields[out], &vec);
689         CeedChkBackend(ierr);
690         // Check if active output
691         if (vec == CEED_VECTOR_ACTIVE) {
692           CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST,
693                              CEED_USE_POINTER, a); CeedChkBackend(ierr);
694           ierr = CeedQFunctionFieldGetSize(qf_output_fields[out], &size);
695           CeedChkBackend(ierr);
696           a += size*Q*blk_size; // Advance the pointer by the size of the output
697         }
698       }
699       // Apply QFunction
700       ierr = CeedQFunctionApply(qf, Q*blk_size, impl->q_vecs_in, impl->q_vecs_out);
701       CeedChkBackend(ierr);
702     }
703   }
704 
705   // Un-set output Qvecs to prevent accidental overwrite of Assembled
706   for (CeedInt out=0; out<num_output_fields; out++) {
707     // Get output vector
708     ierr = CeedOperatorFieldGetVector(op_output_fields[out], &vec);
709     CeedChkBackend(ierr);
710     // Check if active output
711     if (vec == CEED_VECTOR_ACTIVE) {
712       CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_COPY_VALUES,
713                          NULL); CeedChkBackend(ierr);
714     }
715   }
716 
717   // Restore input arrays
718   ierr = CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields,
719          op_input_fields, true, e_data_full, impl); CeedChkBackend(ierr);
720 
721   // Output blocked restriction
722   ierr = CeedVectorRestoreArray(l_vec, &a); CeedChkBackend(ierr);
723   ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr);
724   CeedElemRestriction blk_rstr = impl->qf_blk_rstr;
725   if (!impl->qf_blk_rstr) {
726     ierr = CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, blk_size,
727            num_active_in*num_active_out, num_active_in*num_active_out*num_elem*Q,
728            strides, &blk_rstr); CeedChkBackend(ierr);
729     impl->qf_blk_rstr = blk_rstr;
730   }
731   ierr = CeedElemRestrictionApply(blk_rstr, CEED_TRANSPOSE, l_vec, *assembled,
732                                   request); CeedChkBackend(ierr);
733 
734   return CEED_ERROR_SUCCESS;
735 }
736 
737 //------------------------------------------------------------------------------
738 // Assemble Linear QFunction
739 //------------------------------------------------------------------------------
740 static int CeedOperatorLinearAssembleQFunction_Blocked(CeedOperator op,
741     CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
742   return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, true, assembled,
743          rstr, request);
744 }
745 
746 //------------------------------------------------------------------------------
747 // Update Assembled Linear QFunction
748 //------------------------------------------------------------------------------
749 static int CeedOperatorLinearAssembleQFunctionUpdate_Blocked(CeedOperator op,
750     CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
751   return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, false, &assembled,
752          &rstr, request);
753 }
754 
755 //------------------------------------------------------------------------------
756 // Operator Destroy
757 //------------------------------------------------------------------------------
758 static int CeedOperatorDestroy_Blocked(CeedOperator op) {
759   int ierr;
760   CeedOperator_Blocked *impl;
761   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
762 
763   for (CeedInt i=0; i<impl->num_inputs+impl->num_outputs; i++) {
764     ierr = CeedElemRestrictionDestroy(&impl->blk_restr[i]); CeedChkBackend(ierr);
765     ierr = CeedVectorDestroy(&impl->e_vecs_full[i]); CeedChkBackend(ierr);
766   }
767   ierr = CeedFree(&impl->blk_restr); CeedChkBackend(ierr);
768   ierr = CeedFree(&impl->e_vecs_full); CeedChkBackend(ierr);
769   ierr = CeedFree(&impl->input_states); CeedChkBackend(ierr);
770 
771   for (CeedInt i=0; i<impl->num_inputs; i++) {
772     ierr = CeedVectorDestroy(&impl->e_vecs_in[i]); CeedChkBackend(ierr);
773     ierr = CeedVectorDestroy(&impl->q_vecs_in[i]); CeedChkBackend(ierr);
774   }
775   ierr = CeedFree(&impl->e_vecs_in); CeedChkBackend(ierr);
776   ierr = CeedFree(&impl->q_vecs_in); CeedChkBackend(ierr);
777 
778   for (CeedInt i=0; i<impl->num_outputs; i++) {
779     ierr = CeedVectorDestroy(&impl->e_vecs_out[i]); CeedChkBackend(ierr);
780     ierr = CeedVectorDestroy(&impl->q_vecs_out[i]); CeedChkBackend(ierr);
781   }
782   ierr = CeedFree(&impl->e_vecs_out); CeedChkBackend(ierr);
783   ierr = CeedFree(&impl->q_vecs_out); CeedChkBackend(ierr);
784 
785   // QFunction assembly data
786   for (CeedInt i=0; i<impl->num_active_in; i++) {
787     ierr = CeedVectorDestroy(&impl->qf_active_in[i]); CeedChkBackend(ierr);
788   }
789   ierr = CeedFree(&impl->qf_active_in); CeedChkBackend(ierr);
790   ierr = CeedVectorDestroy(&impl->qf_l_vec); CeedChkBackend(ierr);
791   ierr = CeedElemRestrictionDestroy(&impl->qf_blk_rstr); CeedChkBackend(ierr);
792 
793   ierr = CeedFree(&impl); CeedChkBackend(ierr);
794   return CEED_ERROR_SUCCESS;
795 }
796 
797 //------------------------------------------------------------------------------
798 // Operator Create
799 //------------------------------------------------------------------------------
800 int CeedOperatorCreate_Blocked(CeedOperator op) {
801   int ierr;
802   Ceed ceed;
803   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
804   CeedOperator_Blocked *impl;
805 
806   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
807   ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr);
808 
809   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction",
810                                 CeedOperatorLinearAssembleQFunction_Blocked);
811   CeedChkBackend(ierr);
812   ierr = CeedSetBackendFunction(ceed, "Operator", op,
813                                 "LinearAssembleQFunctionUpdate",
814                                 CeedOperatorLinearAssembleQFunctionUpdate_Blocked);
815   CeedChkBackend(ierr);
816   ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd",
817                                 CeedOperatorApplyAdd_Blocked); CeedChkBackend(ierr);
818   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
819                                 CeedOperatorDestroy_Blocked); CeedChkBackend(ierr);
820   return CEED_ERROR_SUCCESS;
821 }
822 //------------------------------------------------------------------------------
823