xref: /libCEED/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp (revision 4548da4e4ef44dc0f2704ad6d48ac0ca4a16bc83)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other
2 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
3 // files for details.
4 //
5 // SPDX-License-Identifier: BSD-2-Clause
6 //
7 // This file is part of CEED:  http://github.com/ceed
8 
9 #include <ceed/backend.h>
10 #include <ceed/ceed.h>
11 
12 #include <cassert>
13 #include <string>
14 #include <sycl/sycl.hpp>
15 
16 #include "../sycl/ceed-sycl-compile.hpp"
17 #include "ceed-sycl-ref.hpp"
18 
19 class CeedOperatorSyclLinearDiagonal;
20 class CeedOperatorSyclLinearAssemble;
21 class CeedOperatorSyclLinearAssembleFallback;
22 
23 //------------------------------------------------------------------------------
24 //  Get Basis Emode Pointer
25 //------------------------------------------------------------------------------
26 void CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basisptr, CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp,
27                                       const CeedScalar *grad) {
28   switch (emode) {
29     case CEED_EVAL_NONE:
30       *basisptr = identity;
31       break;
32     case CEED_EVAL_INTERP:
33       *basisptr = interp;
34       break;
35     case CEED_EVAL_GRAD:
36       *basisptr = grad;
37       break;
38     case CEED_EVAL_WEIGHT:
39     case CEED_EVAL_DIV:
40     case CEED_EVAL_CURL:
41       break;  // Caught by QF Assembly
42   }
43 }
44 
45 //------------------------------------------------------------------------------
46 // Destroy operator
47 //------------------------------------------------------------------------------
48 static int CeedOperatorDestroy_Sycl(CeedOperator op) {
49   CeedOperator_Sycl *impl;
50   CeedCallBackend(CeedOperatorGetData(op, &impl));
51   Ceed ceed;
52   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
53   Ceed_Sycl *sycl_data;
54   CeedCallBackend(CeedGetData(ceed, &sycl_data));
55 
56   // Apply data
57   for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) {
58     CeedCallBackend(CeedVectorDestroy(&impl->evecs[i]));
59   }
60   CeedCallBackend(CeedFree(&impl->evecs));
61 
62   for (CeedInt i = 0; i < impl->numein; i++) {
63     CeedCallBackend(CeedVectorDestroy(&impl->qvecsin[i]));
64   }
65   CeedCallBackend(CeedFree(&impl->qvecsin));
66 
67   for (CeedInt i = 0; i < impl->numeout; i++) {
68     CeedCallBackend(CeedVectorDestroy(&impl->qvecsout[i]));
69   }
70   CeedCallBackend(CeedFree(&impl->qvecsout));
71 
72   // QFunction assembly data
73   for (CeedInt i = 0; i < impl->qfnumactivein; i++) {
74     CeedCallBackend(CeedVectorDestroy(&impl->qfactivein[i]));
75   }
76   CeedCallBackend(CeedFree(&impl->qfactivein));
77 
78   // Diag data
79   if (impl->diag) {
80     CeedCallBackend(CeedFree(&impl->diag->h_emodein));
81     CeedCallBackend(CeedFree(&impl->diag->h_emodeout));
82 
83     CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw());
84     CeedCallSycl(ceed, sycl::free(impl->diag->d_emodein, sycl_data->sycl_context));
85     CeedCallSycl(ceed, sycl::free(impl->diag->d_emodeout, sycl_data->sycl_context));
86     CeedCallSycl(ceed, sycl::free(impl->diag->d_identity, sycl_data->sycl_context));
87     CeedCallSycl(ceed, sycl::free(impl->diag->d_interpin, sycl_data->sycl_context));
88     CeedCallSycl(ceed, sycl::free(impl->diag->d_interpout, sycl_data->sycl_context));
89     CeedCallSycl(ceed, sycl::free(impl->diag->d_gradin, sycl_data->sycl_context));
90     CeedCallSycl(ceed, sycl::free(impl->diag->d_gradout, sycl_data->sycl_context));
91     CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr));
92 
93     CeedCallBackend(CeedVectorDestroy(&impl->diag->elemdiag));
94     CeedCallBackend(CeedVectorDestroy(&impl->diag->pbelemdiag));
95   }
96   CeedCallBackend(CeedFree(&impl->diag));
97 
98   if (impl->asmb) {
99     CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw());
100     CeedCallSycl(ceed, sycl::free(impl->asmb->d_B_in, sycl_data->sycl_context));
101     CeedCallSycl(ceed, sycl::free(impl->asmb->d_B_out, sycl_data->sycl_context));
102   }
103   CeedCallBackend(CeedFree(&impl->asmb));
104 
105   CeedCallBackend(CeedFree(&impl));
106   return CEED_ERROR_SUCCESS;
107 }
108 
109 //------------------------------------------------------------------------------
110 // Setup infields or outfields
111 //------------------------------------------------------------------------------
112 static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool isinput, CeedVector *evecs, CeedVector *qvecs, CeedInt starte,
113                                         CeedInt numfields, CeedInt Q, CeedInt numelements) {
114   CeedInt  dim, size;
115   CeedSize q_size;
116   Ceed     ceed;
117   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
118   CeedBasis           basis;
119   CeedElemRestriction Erestrict;
120   CeedOperatorField  *opfields;
121   CeedQFunctionField *qffields;
122   CeedVector          fieldvec;
123   bool                strided;
124   bool                skiprestrict;
125 
126   if (isinput) {
127     CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL));
128     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL));
129   } else {
130     CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields));
131     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields));
132   }
133 
134   // Loop over fields
135   for (CeedInt i = 0; i < numfields; i++) {
136     CeedEvalMode emode;
137     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode));
138 
139     strided      = false;
140     skiprestrict = false;
141     if (emode != CEED_EVAL_WEIGHT) {
142       CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict));
143 
144       // Check whether this field can skip the element restriction:
145       // must be passive input, with emode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
146 
147       // First, check whether the field is input or output:
148       if (isinput) {
149         // Check for passive input:
150         CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &fieldvec));
151         if (fieldvec != CEED_VECTOR_ACTIVE) {
152           // Check emode
153           if (emode == CEED_EVAL_NONE) {
154             // Check for strided restriction
155             CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &strided));
156             if (strided) {
157               // Check if vector is already in preferred backend ordering
158               CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &skiprestrict));
159             }
160           }
161         }
162       }
163       if (skiprestrict) {
164         // We do not need an E-Vector, but will use the input field vector's data directly in the operator application
165         evecs[i + starte] = NULL;
166       } else {
167         CeedCallBackend(CeedElemRestrictionCreateVector(Erestrict, NULL, &evecs[i + starte]));
168       }
169     }
170 
171     switch (emode) {
172       case CEED_EVAL_NONE:
173         CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size));
174         q_size = (CeedSize)numelements * Q * size;
175         CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i]));
176         break;
177       case CEED_EVAL_INTERP:
178         CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size));
179         q_size = (CeedSize)numelements * Q * size;
180         CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i]));
181         break;
182       case CEED_EVAL_GRAD:
183         CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis));
184         CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size));
185         CeedCallBackend(CeedBasisGetDimension(basis, &dim));
186         q_size = (CeedSize)numelements * Q * size;
187         CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i]));
188         break;
189       case CEED_EVAL_WEIGHT:  // Only on input fields
190         CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis));
191         q_size = (CeedSize)numelements * Q;
192         CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i]));
193         CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, NULL, qvecs[i]));
194         break;
195       case CEED_EVAL_DIV:
196         break;  // TODO: Not implemented
197       case CEED_EVAL_CURL:
198         break;  // TODO: Not implemented
199     }
200   }
201   return CEED_ERROR_SUCCESS;
202 }
203 
204 //------------------------------------------------------------------------------
205 // CeedOperator needs to connect all the named fields (be they active or
206 // passive) to the named inputs and outputs of its CeedQFunction.
207 //------------------------------------------------------------------------------
208 static int CeedOperatorSetup_Sycl(CeedOperator op) {
209   bool setupdone;
210   CeedCallBackend(CeedOperatorIsSetupDone(op, &setupdone));
211   if (setupdone) return CEED_ERROR_SUCCESS;
212 
213   Ceed ceed;
214   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
215   CeedOperator_Sycl *impl;
216   CeedCallBackend(CeedOperatorGetData(op, &impl));
217   CeedQFunction qf;
218   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
219   CeedInt Q, numelements, numinputfields, numoutputfields;
220   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
221   CeedCallBackend(CeedOperatorGetNumElements(op, &numelements));
222   CeedOperatorField *opinputfields, *opoutputfields;
223   CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields));
224   CeedQFunctionField *qfinputfields, *qfoutputfields;
225   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields));
226 
227   // Allocate
228   CeedCallBackend(CeedCalloc(numinputfields + numoutputfields, &impl->evecs));
229 
230   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin));
231   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout));
232 
233   impl->numein  = numinputfields;
234   impl->numeout = numoutputfields;
235 
236   // Set up infield and outfield evecs and qvecs
237   // Infields
238   CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, true, impl->evecs, impl->qvecsin, 0, numinputfields, Q, numelements));
239 
240   // Outfields
241   CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, false, impl->evecs, impl->qvecsout, numinputfields, numoutputfields, Q, numelements));
242 
243   CeedCallBackend(CeedOperatorSetSetupDone(op));
244   return CEED_ERROR_SUCCESS;
245 }
246 
247 //------------------------------------------------------------------------------
248 // Setup Operator Inputs
249 //------------------------------------------------------------------------------
250 static inline int CeedOperatorSetupInputs_Sycl(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
251                                                CeedVector invec, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX],
252                                                CeedOperator_Sycl *impl, CeedRequest *request) {
253   CeedEvalMode        emode;
254   CeedVector          vec;
255   CeedElemRestriction Erestrict;
256 
257   for (CeedInt i = 0; i < numinputfields; i++) {
258     // Get input vector
259     CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec));
260     if (vec == CEED_VECTOR_ACTIVE) {
261       if (skipactive) continue;
262       else vec = invec;
263     }
264 
265     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode));
266     if (emode == CEED_EVAL_WEIGHT) {  // Skip
267     } else {
268       // Get input vector
269       CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec));
270       // Get input element restriction
271       CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict));
272       if (vec == CEED_VECTOR_ACTIVE) vec = invec;
273       // Restrict, if necessary
274       if (!impl->evecs[i]) {
275         // No restriction for this field; read data directly from vec.
276         CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&edata[i]));
277       } else {
278         CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, impl->evecs[i], request));
279         // Get evec
280         CeedCallBackend(CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&edata[i]));
281       }
282     }
283   }
284   return CEED_ERROR_SUCCESS;
285 }
286 
287 //------------------------------------------------------------------------------
288 // Input Basis Action
289 //------------------------------------------------------------------------------
290 static inline int CeedOperatorInputBasis_Sycl(CeedInt numelements, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
291                                               CeedInt numinputfields, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX],
292                                               CeedOperator_Sycl *impl) {
293   CeedInt             elemsize, size;
294   CeedElemRestriction Erestrict;
295   CeedEvalMode        emode;
296   CeedBasis           basis;
297 
298   for (CeedInt i = 0; i < numinputfields; i++) {
299     // Skip active input
300     if (skipactive) {
301       CeedVector vec;
302       CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec));
303       if (vec == CEED_VECTOR_ACTIVE) continue;
304     }
305     // Get elemsize, emode, size
306     CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict));
307     CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize));
308     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode));
309     CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size));
310     // Basis action
311     switch (emode) {
312       case CEED_EVAL_NONE:
313         CeedCallBackend(CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i]));
314         break;
315       case CEED_EVAL_INTERP:
316         CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis));
317         CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->evecs[i], impl->qvecsin[i]));
318         break;
319       case CEED_EVAL_GRAD:
320         CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis));
321         CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->evecs[i], impl->qvecsin[i]));
322         break;
323       case CEED_EVAL_WEIGHT:
324         break;  // No action
325       case CEED_EVAL_DIV:
326         break;  // TODO: Not implemented
327       case CEED_EVAL_CURL:
328         break;  // TODO: Not implemented
329     }
330   }
331   return CEED_ERROR_SUCCESS;
332 }
333 
334 //------------------------------------------------------------------------------
335 // Restore Input Vectors
336 //------------------------------------------------------------------------------
337 static inline int CeedOperatorRestoreInputs_Sycl(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
338                                                  const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl) {
339   CeedEvalMode emode;
340   CeedVector   vec;
341 
342   for (CeedInt i = 0; i < numinputfields; i++) {
343     // Skip active input
344     if (skipactive) {
345       CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec));
346       if (vec == CEED_VECTOR_ACTIVE) continue;
347     }
348     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode));
349     if (emode == CEED_EVAL_WEIGHT) {  // Skip
350     } else {
351       if (!impl->evecs[i]) {  // This was a skiprestrict case
352         CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec));
353         CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&edata[i]));
354       } else {
355         CeedCallBackend(CeedVectorRestoreArrayRead(impl->evecs[i], (const CeedScalar **)&edata[i]));
356       }
357     }
358   }
359   return CEED_ERROR_SUCCESS;
360 }
361 
362 //------------------------------------------------------------------------------
363 // Apply and add to output
364 //------------------------------------------------------------------------------
365 static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request) {
366   CeedOperator_Sycl *impl;
367   CeedCallBackend(CeedOperatorGetData(op, &impl));
368   CeedQFunction qf;
369   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
370   CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size;
371   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
372   CeedCallBackend(CeedOperatorGetNumElements(op, &numelements));
373   CeedOperatorField *opinputfields, *opoutputfields;
374   CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields));
375   CeedQFunctionField *qfinputfields, *qfoutputfields;
376   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields));
377   CeedEvalMode        emode;
378   CeedVector          vec;
379   CeedBasis           basis;
380   CeedElemRestriction Erestrict;
381   CeedScalar         *edata[2 * CEED_FIELD_MAX] = {0};
382 
383   // Setup
384   CeedCallBackend(CeedOperatorSetup_Sycl(op));
385 
386   // Input Evecs and Restriction
387   CeedCallBackend(CeedOperatorSetupInputs_Sycl(numinputfields, qfinputfields, opinputfields, invec, false, edata, impl, request));
388 
389   // Input basis apply if needed
390   CeedCallBackend(CeedOperatorInputBasis_Sycl(numelements, qfinputfields, opinputfields, numinputfields, false, edata, impl));
391 
392   // Output pointers, as necessary
393   for (CeedInt i = 0; i < numoutputfields; i++) {
394     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode));
395     if (emode == CEED_EVAL_NONE) {
396       // Set the output Q-Vector to use the E-Vector data directly
397       CeedCallBackend(CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, &edata[i + numinputfields]));
398       CeedCallBackend(CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i + numinputfields]));
399     }
400   }
401 
402   // Q function
403   CeedCallBackend(CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout));
404 
405   // Output basis apply if needed
406   for (CeedInt i = 0; i < numoutputfields; i++) {
407     // Get elemsize, emode, size
408     CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict));
409     CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize));
410     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode));
411     CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size));
412     // Basis action
413     switch (emode) {
414       case CEED_EVAL_NONE:
415         break;
416       case CEED_EVAL_INTERP:
417         CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis));
418         CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->qvecsout[i], impl->evecs[i + impl->numein]));
419         break;
420       case CEED_EVAL_GRAD:
421         CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis));
422         CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->qvecsout[i], impl->evecs[i + impl->numein]));
423         break;
424       // LCOV_EXCL_START
425       case CEED_EVAL_WEIGHT:
426         Ceed ceed;
427         CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
428         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
429         break;  // Should not occur
430       case CEED_EVAL_DIV:
431         break;  // TODO: Not implemented
432       case CEED_EVAL_CURL:
433         break;  // TODO: Not implemented
434                 // LCOV_EXCL_STOP
435     }
436   }
437 
438   // Output restriction
439   for (CeedInt i = 0; i < numoutputfields; i++) {
440     // Restore evec
441     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode));
442     if (emode == CEED_EVAL_NONE) {
443       CeedCallBackend(CeedVectorRestoreArray(impl->evecs[i + impl->numein], &edata[i + numinputfields]));
444     }
445     // Get output vector
446     CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec));
447     // Restrict
448     CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict));
449     // Active
450     if (vec == CEED_VECTOR_ACTIVE) vec = outvec;
451 
452     CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, impl->evecs[i + impl->numein], vec, request));
453   }
454 
455   // Restore input arrays
456   CeedCallBackend(CeedOperatorRestoreInputs_Sycl(numinputfields, qfinputfields, opinputfields, false, edata, impl));
457   return CEED_ERROR_SUCCESS;
458 }
459 
460 //------------------------------------------------------------------------------
461 // Core code for assembling linear QFunction
462 //------------------------------------------------------------------------------
463 static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
464                                                                CeedRequest *request) {
465   CeedOperator_Sycl *impl;
466   CeedCallBackend(CeedOperatorGetData(op, &impl));
467   CeedQFunction qf;
468   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
469   CeedInt  Q, numelements, numinputfields, numoutputfields, size;
470   CeedSize q_size;
471   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
472   CeedCallBackend(CeedOperatorGetNumElements(op, &numelements));
473   CeedOperatorField *opinputfields, *opoutputfields;
474   CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields));
475   CeedQFunctionField *qfinputfields, *qfoutputfields;
476   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields));
477   CeedVector  vec;
478   CeedInt     numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout;
479   CeedVector *activein = impl->qfactivein;
480   CeedScalar *a, *tmp;
481   Ceed        ceed, ceedparent;
482   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
483   CeedCallBackend(CeedGetOperatorFallbackParentCeed(ceed, &ceedparent));
484   ceedparent = ceedparent ? ceedparent : ceed;
485   CeedScalar *edata[2 * CEED_FIELD_MAX];
486 
487   // Setup
488   CeedCallBackend(CeedOperatorSetup_Sycl(op));
489 
490   // Check for identity
491   bool identityqf;
492   CeedCallBackend(CeedQFunctionIsIdentity(qf, &identityqf));
493   if (identityqf) {
494     // LCOV_EXCL_START
495     return CeedError(ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported");
496     // LCOV_EXCL_STOP
497   }
498 
499   // Input Evecs and Restriction
500   CeedCallBackend(CeedOperatorSetupInputs_Sycl(numinputfields, qfinputfields, opinputfields, NULL, true, edata, impl, request));
501 
502   // Count number of active input fields
503   if (!numactivein) {
504     for (CeedInt i = 0; i < numinputfields; i++) {
505       // Get input vector
506       CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec));
507       // Check if active input
508       if (vec == CEED_VECTOR_ACTIVE) {
509         CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size));
510         CeedCallBackend(CeedVectorSetValue(impl->qvecsin[i], 0.0));
511         CeedCallBackend(CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp));
512         CeedCallBackend(CeedRealloc(numactivein + size, &activein));
513         for (CeedInt field = 0; field < size; field++) {
514           q_size = (CeedSize)Q * numelements;
515           CeedCallBackend(CeedVectorCreate(ceed, q_size, &activein[numactivein + field]));
516           CeedCallBackend(CeedVectorSetArray(activein[numactivein + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &tmp[field * Q * numelements]));
517         }
518         numactivein += size;
519         CeedCallBackend(CeedVectorRestoreArray(impl->qvecsin[i], &tmp));
520       }
521     }
522     impl->qfnumactivein = numactivein;
523     impl->qfactivein    = activein;
524   }
525 
526   // Count number of active output fields
527   if (!numactiveout) {
528     for (CeedInt i = 0; i < numoutputfields; i++) {
529       // Get output vector
530       CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec));
531       // Check if active output
532       if (vec == CEED_VECTOR_ACTIVE) {
533         CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size));
534         numactiveout += size;
535       }
536     }
537     impl->qfnumactiveout = numactiveout;
538   }
539 
540   // Check sizes
541   if (!numactivein || !numactiveout) {
542     // LCOV_EXCL_START
543     return CeedError(ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
544     // LCOV_EXCL_STOP
545   }
546 
547   // Build objects if needed
548   if (build_objects) {
549     // Create output restriction
550     CeedInt strides[3] = {1, numelements * Q, Q}; /* *NOPAD* */
551     CeedCallBackend(CeedElemRestrictionCreateStrided(ceedparent, numelements, Q, numactivein * numactiveout,
552                                                      numactivein * numactiveout * numelements * Q, strides, rstr));
553     // Create assembled vector
554     CeedSize l_size = (CeedSize)numelements * Q * numactivein * numactiveout;
555     CeedCallBackend(CeedVectorCreate(ceedparent, l_size, assembled));
556   }
557   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
558   CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a));
559 
560   // Input basis apply
561   CeedCallBackend(CeedOperatorInputBasis_Sycl(numelements, qfinputfields, opinputfields, numinputfields, true, edata, impl));
562 
563   // Assemble QFunction
564   for (CeedInt in = 0; in < numactivein; in++) {
565     // Set Inputs
566     CeedCallBackend(CeedVectorSetValue(activein[in], 1.0));
567     if (numactivein > 1) {
568       CeedCallBackend(CeedVectorSetValue(activein[(in + numactivein - 1) % numactivein], 0.0));
569     }
570     // Set Outputs
571     for (CeedInt out = 0; out < numoutputfields; out++) {
572       // Get output vector
573       CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec));
574       // Check if active output
575       if (vec == CEED_VECTOR_ACTIVE) {
576         CeedCallBackend(CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE, CEED_USE_POINTER, a));
577         CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[out], &size));
578         a += size * Q * numelements;  // Advance the pointer by the size of the output
579       }
580     }
581     // Apply QFunction
582     CeedCallBackend(CeedQFunctionApply(qf, Q * numelements, impl->qvecsin, impl->qvecsout));
583   }
584 
585   // Un-set output Qvecs to prevent accidental overwrite of Assembled
586   for (CeedInt out = 0; out < numoutputfields; out++) {
587     // Get output vector
588     CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec));
589     // Check if active output
590     if (vec == CEED_VECTOR_ACTIVE) {
591       CeedCallBackend(CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL));
592     }
593   }
594 
595   // Restore input arrays
596   CeedCallBackend(CeedOperatorRestoreInputs_Sycl(numinputfields, qfinputfields, opinputfields, true, edata, impl));
597 
598   // Restore output
599   CeedCallBackend(CeedVectorRestoreArray(*assembled, &a));
600 
601   return CEED_ERROR_SUCCESS;
602 }
603 
604 //------------------------------------------------------------------------------
605 // Assemble Linear QFunction
606 //------------------------------------------------------------------------------
607 static int CeedOperatorLinearAssembleQFunction_Sycl(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
608   return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, true, assembled, rstr, request);
609 }
610 
611 //------------------------------------------------------------------------------
612 // Update Assembled Linear QFunction
613 //------------------------------------------------------------------------------
614 static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
615   return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, false, &assembled, &rstr, request);
616 }
617 
618 //------------------------------------------------------------------------------
619 // Create point block restriction
620 //------------------------------------------------------------------------------
621 static int CreatePBRestriction(CeedElemRestriction rstr, CeedElemRestriction *pbRstr) {
622   Ceed ceed;
623   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
624   const CeedInt *offsets;
625   CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets));
626 
627   // Expand offsets
628   CeedInt  nelem, ncomp, elemsize, compstride, *pbOffsets;
629   CeedSize l_size;
630   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &nelem));
631   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &ncomp));
632   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elemsize));
633   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &compstride));
634   CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
635   CeedInt shift = ncomp;
636   if (compstride != 1) shift *= ncomp;
637   CeedCallBackend(CeedCalloc(nelem * elemsize, &pbOffsets));
638   for (CeedInt i = 0; i < nelem * elemsize; i++) {
639     pbOffsets[i] = offsets[i] * shift;
640   }
641 
642   // Create new restriction
643   CeedCallBackend(
644       CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp * ncomp, 1, l_size * ncomp, CEED_MEM_HOST, CEED_OWN_POINTER, pbOffsets, pbRstr));
645 
646   // Cleanup
647   CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets));
648 
649   return CEED_ERROR_SUCCESS;
650 }
651 
652 //------------------------------------------------------------------------------
653 // Assemble diagonal setup
654 //------------------------------------------------------------------------------
655 static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const bool pointBlock) {
656   Ceed ceed;
657   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
658   CeedQFunction qf;
659   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
660   CeedInt numinputfields, numoutputfields;
661   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields));
662 
663   // Determine active input basis
664   CeedOperatorField  *opfields;
665   CeedQFunctionField *qffields;
666   CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL));
667   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL));
668   CeedInt             numemodein = 0, ncomp = 0, dim = 1;
669   CeedEvalMode       *emodein = NULL;
670   CeedBasis           basisin = NULL;
671   CeedElemRestriction rstrin  = NULL;
672   for (CeedInt i = 0; i < numinputfields; i++) {
673     CeedVector vec;
674     CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec));
675     if (vec == CEED_VECTOR_ACTIVE) {
676       CeedElemRestriction rstr;
677       CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisin));
678       CeedCallBackend(CeedBasisGetNumComponents(basisin, &ncomp));
679       CeedCallBackend(CeedBasisGetDimension(basisin, &dim));
680       CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr));
681       if (rstrin && rstrin != rstr) {
682         // LCOV_EXCL_START
683         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly");
684         // LCOV_EXCL_STOP
685       }
686       rstrin = rstr;
687       CeedEvalMode emode;
688       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode));
689       switch (emode) {
690         case CEED_EVAL_NONE:
691         case CEED_EVAL_INTERP:
692           CeedCallBackend(CeedRealloc(numemodein + 1, &emodein));
693           emodein[numemodein] = emode;
694           numemodein += 1;
695           break;
696         case CEED_EVAL_GRAD:
697           CeedCallBackend(CeedRealloc(numemodein + dim, &emodein));
698           for (CeedInt d = 0; d < dim; d++) emodein[numemodein + d] = emode;
699           numemodein += dim;
700           break;
701         case CEED_EVAL_WEIGHT:
702         case CEED_EVAL_DIV:
703         case CEED_EVAL_CURL:
704           break;  // Caught by QF Assembly
705       }
706     }
707   }
708 
709   // Determine active output basis
710   CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields));
711   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields));
712   CeedInt             numemodeout = 0;
713   CeedEvalMode       *emodeout    = NULL;
714   CeedBasis           basisout    = NULL;
715   CeedElemRestriction rstrout     = NULL;
716   for (CeedInt i = 0; i < numoutputfields; i++) {
717     CeedVector vec;
718     CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec));
719     if (vec == CEED_VECTOR_ACTIVE) {
720       CeedElemRestriction rstr;
721       CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisout));
722       CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr));
723       if (rstrout && rstrout != rstr) {
724         // LCOV_EXCL_START
725         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly");
726         // LCOV_EXCL_STOP
727       }
728       rstrout = rstr;
729       CeedEvalMode emode;
730       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode));
731       switch (emode) {
732         case CEED_EVAL_NONE:
733         case CEED_EVAL_INTERP:
734           CeedCallBackend(CeedRealloc(numemodeout + 1, &emodeout));
735           emodeout[numemodeout] = emode;
736           numemodeout += 1;
737           break;
738         case CEED_EVAL_GRAD:
739           CeedCallBackend(CeedRealloc(numemodeout + dim, &emodeout));
740           for (CeedInt d = 0; d < dim; d++) emodeout[numemodeout + d] = emode;
741           numemodeout += dim;
742           break;
743         case CEED_EVAL_WEIGHT:
744         case CEED_EVAL_DIV:
745         case CEED_EVAL_CURL:
746           break;  // Caught by QF Assembly
747       }
748     }
749   }
750 
751   // Operator data struct
752   CeedOperator_Sycl *impl;
753   CeedCallBackend(CeedOperatorGetData(op, &impl));
754   Ceed_Sycl *sycl_data;
755   CeedCallBackend(CeedGetData(ceed, &sycl_data));
756   CeedCallBackend(CeedCalloc(1, &impl->diag));
757   CeedOperatorDiag_Sycl *diag = impl->diag;
758   diag->basisin               = basisin;
759   diag->basisout              = basisout;
760   diag->h_emodein             = emodein;
761   diag->h_emodeout            = emodeout;
762   diag->numemodein            = numemodein;
763   diag->numemodeout           = numemodeout;
764 
765   // Kernel parameters
766   CeedInt nnodes, nqpts;
767   CeedCallBackend(CeedBasisGetNumNodes(basisin, &nnodes));
768   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basisin, &nqpts));
769   diag->nnodes = nnodes;
770   diag->nqpts  = nqpts;
771   diag->ncomp  = ncomp;
772 
773   // Basis matrices
774   const CeedInt     iLen = nqpts * nnodes;
775   const CeedInt     gLen = nqpts * nnodes * dim;
776   const CeedScalar *interpin, *interpout, *gradin, *gradout;
777 
778   // CEED_EVAL_NONE
779   CeedScalar *identity = NULL;
780   bool        evalNone = false;
781   for (CeedInt i = 0; i < numemodein; i++) evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE);
782   for (CeedInt i = 0; i < numemodeout; i++) evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE);
783 
784   // Order queue
785   sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
786 
787   std::vector<sycl::event> copy_events;
788   if (evalNone) {
789     CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity));
790     for (CeedInt i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0;
791     CeedCallSycl(ceed, diag->d_identity = sycl::malloc_device<CeedScalar>(iLen, sycl_data->sycl_device, sycl_data->sycl_context));
792     sycl::event identity_copy = sycl_data->sycl_queue.copy<CeedScalar>(identity, diag->d_identity, iLen, {e});
793     copy_events.push_back(identity_copy);
794   }
795 
796   // CEED_EVAL_INTERP
797   CeedCallBackend(CeedBasisGetInterp(basisin, &interpin));
798   CeedCallSycl(ceed, diag->d_interpin = sycl::malloc_device<CeedScalar>(iLen, sycl_data->sycl_device, sycl_data->sycl_context));
799   sycl::event interpin_copy = sycl_data->sycl_queue.copy<CeedScalar>(interpin, diag->d_interpin, iLen, {e});
800   copy_events.push_back(interpin_copy);
801 
802   CeedCallBackend(CeedBasisGetInterp(basisout, &interpout));
803   CeedCallSycl(ceed, diag->d_interpout = sycl::malloc_device<CeedScalar>(iLen, sycl_data->sycl_device, sycl_data->sycl_context));
804   sycl::event interpout_copy = sycl_data->sycl_queue.copy<CeedScalar>(interpout, diag->d_interpout, iLen, {e});
805   copy_events.push_back(interpout_copy);
806 
807   // CEED_EVAL_GRAD
808   CeedCallBackend(CeedBasisGetGrad(basisin, &gradin));
809   CeedCallSycl(ceed, diag->d_gradin = sycl::malloc_device<CeedScalar>(gLen, sycl_data->sycl_device, sycl_data->sycl_context));
810   sycl::event gradin_copy = sycl_data->sycl_queue.copy<CeedScalar>(gradin, diag->d_gradin, gLen, {e});
811   copy_events.push_back(gradin_copy);
812 
813   CeedCallBackend(CeedBasisGetGrad(basisout, &gradout));
814   CeedCallSycl(ceed, diag->d_gradout = sycl::malloc_device<CeedScalar>(gLen, sycl_data->sycl_device, sycl_data->sycl_context));
815   sycl::event gradout_copy = sycl_data->sycl_queue.copy<CeedScalar>(gradout, diag->d_gradout, gLen, {e});
816   copy_events.push_back(gradout_copy);
817 
818   // Arrays of emodes
819   CeedCallSycl(ceed, diag->d_emodein = sycl::malloc_device<CeedEvalMode>(numemodein, sycl_data->sycl_device, sycl_data->sycl_context));
820   sycl::event emodein_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(emodein, diag->d_emodein, numemodein, {e});
821   copy_events.push_back(emodein_copy);
822 
823   CeedCallSycl(ceed, diag->d_emodeout = sycl::malloc_device<CeedEvalMode>(numemodeout, sycl_data->sycl_device, sycl_data->sycl_context));
824   sycl::event emodeout_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(emodeout, diag->d_emodeout, numemodeout, {e});
825   copy_events.push_back(emodeout_copy);
826 
827   // Restriction
828   diag->diagrstr = rstrout;
829 
830   // Wait for all copies to complete and handle exceptions
831   CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
832 
833   return CEED_ERROR_SUCCESS;
834 }
835 
836 //------------------------------------------------------------------------------
837 //  Kernel for diagonal assembly
838 //------------------------------------------------------------------------------
839 static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool pointBlock, const CeedInt nelem, const CeedOperatorDiag_Sycl *diag,
840                                            const CeedScalar *assembledqfarray, CeedScalar *elemdiagarray) {
841   const CeedInt nnodes      = diag->nnodes;
842   const CeedInt nqpts       = diag->nqpts;
843   const CeedInt ncomp       = diag->ncomp;
844   const CeedInt numemodein  = diag->numemodein;
845   const CeedInt numemodeout = diag->numemodeout;
846 
847   const CeedScalar   *identity  = diag->d_identity;
848   const CeedScalar   *interpin  = diag->d_interpin;
849   const CeedScalar   *gradin    = diag->d_gradin;
850   const CeedScalar   *interpout = diag->d_interpout;
851   const CeedScalar   *gradout   = diag->d_gradout;
852   const CeedEvalMode *emodein   = diag->d_emodein;
853   const CeedEvalMode *emodeout  = diag->d_emodeout;
854 
855   sycl::range<1> kernel_range(nelem * nnodes);
856 
857   // Order queue
858   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
859   sycl_queue.parallel_for<CeedOperatorSyclLinearDiagonal>(kernel_range, {e}, [=](sycl::id<1> idx) {
860     const CeedInt tid = idx % nnodes;
861     const CeedInt e   = idx / nnodes;
862 
863     // Compute the diagonal of B^T D B
864     // Each element
865     CeedInt dout = -1;
866     // Each basis eval mode pair
867     for (CeedInt eout = 0; eout < numemodeout; eout++) {
868       const CeedScalar *bt = NULL;
869       if (emodeout[eout] == CEED_EVAL_GRAD) ++dout;
870       CeedOperatorGetBasisPointer_Sycl(&bt, emodeout[eout], identity, interpout, &gradout[dout * nqpts * nnodes]);
871       CeedInt din = -1;
872       for (CeedInt ein = 0; ein < numemodein; ein++) {
873         const CeedScalar *b = NULL;
874         if (emodein[ein] == CEED_EVAL_GRAD) ++din;
875         CeedOperatorGetBasisPointer_Sycl(&b, emodein[ein], identity, interpin, &gradin[din * nqpts * nnodes]);
876         // Each component
877         for (CeedInt compOut = 0; compOut < ncomp; compOut++) {
878           // Each qpoint/node pair
879           if (pointBlock) {
880             // Point Block Diagonal
881             for (CeedInt compIn = 0; compIn < ncomp; compIn++) {
882               CeedScalar evalue = 0.0;
883               for (CeedInt q = 0; q < nqpts; q++) {
884                 const CeedScalar qfvalue =
885                     assembledqfarray[((((ein * ncomp + compIn) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q];
886                 evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid];
887               }
888               elemdiagarray[((compOut * ncomp + compIn) * nelem + e) * nnodes + tid] += evalue;
889             }
890           } else {
891             // Diagonal Only
892             CeedScalar evalue = 0.0;
893             for (CeedInt q = 0; q < nqpts; q++) {
894               const CeedScalar qfvalue =
895                   assembledqfarray[((((ein * ncomp + compOut) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q];
896               evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid];
897             }
898             elemdiagarray[(compOut * nelem + e) * nnodes + tid] += evalue;
899           }
900         }
901       }
902     }
903   });
904   return CEED_ERROR_SUCCESS;
905 }
906 
907 //------------------------------------------------------------------------------
908 // Assemble diagonal common code
909 //------------------------------------------------------------------------------
910 static inline int CeedOperatorAssembleDiagonalCore_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool pointBlock) {
911   Ceed ceed;
912   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
913   CeedOperator_Sycl *impl;
914   CeedCallBackend(CeedOperatorGetData(op, &impl));
915   Ceed_Sycl *sycl_data;
916   CeedCallBackend(CeedGetData(ceed, &sycl_data));
917 
918   // Assemble QFunction
919   CeedVector          assembledqf = NULL;
920   CeedElemRestriction rstr        = NULL;
921   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, &rstr, request));
922   CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
923 
924   // Setup
925   if (!impl->diag) {
926     CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Sycl(op, pointBlock));
927   }
928   CeedOperatorDiag_Sycl *diag = impl->diag;
929   assert(diag != NULL);
930 
931   // Restriction
932   if (pointBlock && !diag->pbdiagrstr) {
933     CeedElemRestriction pbdiagrstr;
934     CeedCallBackend(CreatePBRestriction(diag->diagrstr, &pbdiagrstr));
935     diag->pbdiagrstr = pbdiagrstr;
936   }
937   CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr;
938 
939   // Create diagonal vector
940   CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag;
941   if (!elemdiag) {
942     CeedCallBackend(CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag));
943     if (pointBlock) diag->pbelemdiag = elemdiag;
944     else diag->elemdiag = elemdiag;
945   }
946   CeedCallBackend(CeedVectorSetValue(elemdiag, 0.0));
947 
948   // Assemble element operator diagonals
949   CeedScalar       *elemdiagarray;
950   const CeedScalar *assembledqfarray;
951   CeedCallBackend(CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray));
952   CeedCallBackend(CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray));
953   CeedInt nelem;
954   CeedCallBackend(CeedElemRestrictionGetNumElements(diagrstr, &nelem));
955 
956   // Compute the diagonal of B^T D B
957   // Umesh: This needs to be reviewed later
958   // if (pointBlock) {
959   //  CeedCallBackend(CeedOperatorLinearPointBlockDiagonal_Sycl(sycl_data->sycl_queue, nelem, diag, assembledqfarray, elemdiagarray));
960   //} else {
961   CeedCallBackend(CeedOperatorLinearDiagonal_Sycl(sycl_data->sycl_queue, pointBlock, nelem, diag, assembledqfarray, elemdiagarray));
962   // }
963 
964   // Wait for queue to complete and handle exceptions
965   sycl_data->sycl_queue.wait_and_throw();
966 
967   // Restore arrays
968   CeedCallBackend(CeedVectorRestoreArray(elemdiag, &elemdiagarray));
969   CeedCallBackend(CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray));
970 
971   // Assemble local operator diagonal
972   CeedCallBackend(CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, assembled, request));
973 
974   // Cleanup
975   CeedCallBackend(CeedVectorDestroy(&assembledqf));
976 
977   return CEED_ERROR_SUCCESS;
978 }
979 
980 //------------------------------------------------------------------------------
981 // Assemble Linear Diagonal
982 //------------------------------------------------------------------------------
983 static int CeedOperatorLinearAssembleAddDiagonal_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request) {
984   CeedCallBackend(CeedOperatorAssembleDiagonalCore_Sycl(op, assembled, request, false));
985   return CEED_ERROR_SUCCESS;
986 }
987 
988 //------------------------------------------------------------------------------
989 // Assemble Linear Point Block Diagonal
990 //------------------------------------------------------------------------------
991 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request) {
992   CeedCallBackend(CeedOperatorAssembleDiagonalCore_Sycl(op, assembled, request, true));
993   return CEED_ERROR_SUCCESS;
994 }
995 
996 //------------------------------------------------------------------------------
997 // Single operator assembly setup
998 //------------------------------------------------------------------------------
999 static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
1000   Ceed ceed;
1001   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1002   CeedOperator_Sycl *impl;
1003   CeedCallBackend(CeedOperatorGetData(op, &impl));
1004 
1005   // Get input and output fields
1006   CeedInt            num_input_fields, num_output_fields;
1007   CeedOperatorField *input_fields;
1008   CeedOperatorField *output_fields;
1009   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
1010 
1011   // Determine active input basis eval mode
1012   CeedQFunction qf;
1013   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1014   CeedQFunctionField *qf_fields;
1015   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
1016   // Note that the kernel will treat each dimension of a gradient action separately;
1017   // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment by dim.
1018   // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so
1019   // num_B_in_mats_to_load will be incremented by 1.
1020   CeedInt             num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0;
1021   CeedEvalMode       *eval_mode_in = NULL;  // will be of size num_B_in_mats_load
1022   CeedBasis           basis_in     = NULL;
1023   CeedInt             nqpts = 0, esize = 0;
1024   CeedElemRestriction rstr_in = NULL;
1025   for (CeedInt i = 0; i < num_input_fields; i++) {
1026     CeedVector vec;
1027     CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec));
1028     if (vec == CEED_VECTOR_ACTIVE) {
1029       CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in));
1030       CeedCallBackend(CeedBasisGetDimension(basis_in, &dim));
1031       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &nqpts));
1032       CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in));
1033       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &esize));
1034       CeedEvalMode eval_mode;
1035       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1036       if (eval_mode != CEED_EVAL_NONE) {
1037         CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in));
1038         eval_mode_in[num_B_in_mats_to_load] = eval_mode;
1039         num_B_in_mats_to_load += 1;
1040         if (eval_mode == CEED_EVAL_GRAD) {
1041           num_emode_in += dim;
1042           size_B_in += dim * esize * nqpts;
1043         } else {
1044           num_emode_in += 1;
1045           size_B_in += esize * nqpts;
1046         }
1047       }
1048     }
1049   }
1050 
1051   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
1052   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
1053   CeedInt             num_emode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0;
1054   CeedEvalMode       *eval_mode_out = NULL;
1055   CeedBasis           basis_out     = NULL;
1056   CeedElemRestriction rstr_out      = NULL;
1057   for (CeedInt i = 0; i < num_output_fields; i++) {
1058     CeedVector vec;
1059     CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec));
1060     if (vec == CEED_VECTOR_ACTIVE) {
1061       CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out));
1062       CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out));
1063       if (rstr_out && rstr_out != rstr_in) {
1064         // LCOV_EXCL_START
1065         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly");
1066         // LCOV_EXCL_STOP
1067       }
1068       CeedEvalMode eval_mode;
1069       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
1070       if (eval_mode != CEED_EVAL_NONE) {
1071         CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out));
1072         eval_mode_out[num_B_out_mats_to_load] = eval_mode;
1073         num_B_out_mats_to_load += 1;
1074         if (eval_mode == CEED_EVAL_GRAD) {
1075           num_emode_out += dim;
1076           size_B_out += dim * esize * nqpts;
1077         } else {
1078           num_emode_out += 1;
1079           size_B_out += esize * nqpts;
1080         }
1081       }
1082     }
1083   }
1084 
1085   if (num_emode_in == 0 || num_emode_out == 0) {
1086     // LCOV_EXCL_START
1087     return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
1088     // LCOV_EXCL_STOP
1089   }
1090 
1091   CeedInt nelem, ncomp;
1092   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &nelem));
1093   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &ncomp));
1094 
1095   CeedCallBackend(CeedCalloc(1, &impl->asmb));
1096   CeedOperatorAssemble_Sycl *asmb = impl->asmb;
1097   asmb->nelem                     = nelem;
1098 
1099   Ceed_Sycl *sycl_data;
1100   CeedCallBackend(CeedGetData(ceed, &sycl_data));
1101 
1102   // Kernel setup
1103   int elemsPerBlock   = 1;
1104   asmb->elemsPerBlock = elemsPerBlock;
1105   CeedInt block_size  = esize * esize * elemsPerBlock;
1106   /* CeedInt maxThreadsPerBlock = sycl_data->sycl_device.get_info<sycl::info::device::max_work_group_size>();
1107   bool    fallback           = block_size > maxThreadsPerBlock;
1108   asmb->fallback             = fallback;
1109   if (fallback) {
1110     // Use fallback kernel with 1D threadblock
1111     block_size         = esize * elemsPerBlock;
1112     asmb->block_size_x = esize;
1113     asmb->block_size_y = 1;
1114   } else {  // Use kernel with 2D threadblock
1115     asmb->block_size_x = esize;
1116     asmb->block_size_y = esize;
1117   }*/
1118   asmb->block_size_x = esize;
1119   asmb->block_size_y = esize;
1120   asmb->numemodein   = num_emode_in;
1121   asmb->numemodeout  = num_emode_out;
1122   asmb->nqpts        = nqpts;
1123   asmb->nnodes       = esize;
1124   asmb->block_size   = block_size;
1125   asmb->ncomp        = ncomp;
1126 
1127   // Build 'full' B matrices (not 1D arrays used for tensor-product matrices
1128   const CeedScalar *interp_in, *grad_in;
1129   CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in));
1130   CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in));
1131 
1132   // Load into B_in, in order that they will be used in eval_mode
1133   CeedInt mat_start = 0;
1134   CeedCallSycl(ceed, asmb->d_B_in = sycl::malloc_device<CeedScalar>(size_B_in, sycl_data->sycl_device, sycl_data->sycl_context));
1135   for (int i = 0; i < num_B_in_mats_to_load; i++) {
1136     CeedEvalMode eval_mode = eval_mode_in[i];
1137     if (eval_mode == CEED_EVAL_INTERP) {
1138       // Order queue
1139       sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
1140       sycl_data->sycl_queue.copy<CeedScalar>(interp_in, &asmb->d_B_in[mat_start], esize * nqpts, {e});
1141       mat_start += esize * nqpts;
1142     } else if (eval_mode == CEED_EVAL_GRAD) {
1143       // Order queue
1144       sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
1145       sycl_data->sycl_queue.copy<CeedScalar>(grad_in, &asmb->d_B_in[mat_start], dim * esize * nqpts, {e});
1146       mat_start += dim * esize * nqpts;
1147     }
1148   }
1149 
1150   const CeedScalar *interp_out, *grad_out;
1151   // Note that this function currently assumes 1 basis, so this should always be true
1152   // for now
1153   if (basis_out == basis_in) {
1154     interp_out = interp_in;
1155     grad_out   = grad_in;
1156   } else {
1157     CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out));
1158     CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out));
1159   }
1160 
1161   // Load into B_out, in order that they will be used in eval_mode
1162   mat_start = 0;
1163   CeedCallSycl(ceed, asmb->d_B_out = sycl::malloc_device<CeedScalar>(size_B_out, sycl_data->sycl_device, sycl_data->sycl_context));
1164   for (int i = 0; i < num_B_out_mats_to_load; i++) {
1165     CeedEvalMode eval_mode = eval_mode_out[i];
1166     if (eval_mode == CEED_EVAL_INTERP) {
1167       // Order queue
1168       sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
1169       sycl_data->sycl_queue.copy<CeedScalar>(interp_out, &asmb->d_B_out[mat_start], esize * nqpts, {e});
1170       mat_start += esize * nqpts;
1171     } else if (eval_mode == CEED_EVAL_GRAD) {
1172       // Order queue
1173       sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
1174       sycl_data->sycl_queue.copy<CeedScalar>(grad_out, &asmb->d_B_out[mat_start], dim * esize * nqpts, {e});
1175       mat_start += dim * esize * nqpts;
1176     }
1177   }
1178   return CEED_ERROR_SUCCESS;
1179 }
1180 
1181 //------------------------------------------------------------------------------
1182 // Matrix assembly kernel for low-order elements (3D thread block)
1183 //------------------------------------------------------------------------------
1184 static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOperator_Sycl *impl, const CeedScalar *qf_array,
1185                                            CeedScalar *values_array) {
1186   // This kernels assumes B_in and B_out have the same number of quadrature points and basis points.
1187   // TODO: expand to more general cases
1188   CeedOperatorAssemble_Sycl *asmb        = impl->asmb;
1189   const CeedInt              nelem       = asmb->nelem;
1190   const CeedInt              nnodes      = asmb->nnodes;
1191   const CeedInt              ncomp       = asmb->ncomp;
1192   const CeedInt              nqpts       = asmb->nqpts;
1193   const CeedInt              numemodein  = asmb->numemodein;
1194   const CeedInt              numemodeout = asmb->numemodeout;
1195 
1196   // Strides for final output ordering, determined by the reference (inference) implementation of the symbolic assembly, slowest --> fastest: element,
1197   // comp_in, comp_out, node_row, node_col
1198   const CeedInt comp_out_stride = nnodes * nnodes;
1199   const CeedInt comp_in_stride  = comp_out_stride * ncomp;
1200   const CeedInt e_stride        = comp_in_stride * ncomp;
1201   // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt
1202   const CeedInt qe_stride         = nqpts;
1203   const CeedInt qcomp_out_stride  = nelem * qe_stride;
1204   const CeedInt qemode_out_stride = qcomp_out_stride * ncomp;
1205   const CeedInt qcomp_in_stride   = qemode_out_stride * numemodeout;
1206   const CeedInt qemode_in_stride  = qcomp_in_stride * ncomp;
1207 
1208   CeedScalar *B_in, *B_out;
1209   B_in                       = asmb->d_B_in;
1210   B_out                      = asmb->d_B_out;
1211   const CeedInt block_size_x = asmb->block_size_x;
1212   const CeedInt block_size_y = asmb->block_size_y;
1213 
1214   sycl::range<3> kernel_range(nelem, block_size_y, block_size_x);
1215 
1216   // Order queue
1217   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
1218   sycl_queue.parallel_for<CeedOperatorSyclLinearAssemble>(kernel_range, {e}, [=](sycl::id<3> idx) {
1219     const int e = idx.get(0);  // Element index
1220     const int l = idx.get(1);  // The output column index of each B^TDB operation
1221     const int i = idx.get(2);  // The output row index of each B^TDB operation
1222                                // such that we have (Bout^T)_ij D_jk Bin_kl = C_il
1223     for (CeedInt comp_in = 0; comp_in < ncomp; comp_in++) {
1224       for (CeedInt comp_out = 0; comp_out < ncomp; comp_out++) {
1225         CeedScalar result        = 0.0;
1226         CeedInt    qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e;
1227         for (CeedInt emode_in = 0; emode_in < numemodein; emode_in++) {
1228           CeedInt b_in_index = emode_in * nqpts * nnodes;
1229           for (CeedInt emode_out = 0; emode_out < numemodeout; emode_out++) {
1230             CeedInt b_out_index = emode_out * nqpts * nnodes;
1231             CeedInt qf_index    = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in;
1232             // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
1233             for (CeedInt j = 0; j < nqpts; j++) {
1234               result += B_out[b_out_index + j * nnodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * nnodes + l];
1235             }
1236           }  // end of emode_out
1237         }    // end of emode_in
1238         CeedInt val_index       = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l;
1239         values_array[val_index] = result;
1240       }  // end of out component
1241     }    // end of in component
1242   });
1243 
1244   return CEED_ERROR_SUCCESS;
1245 }
1246 
1247 //------------------------------------------------------------------------------
1248 // Fallback kernel for larger orders (1D thread block)
1249 //------------------------------------------------------------------------------
1250 /*
1251 static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, const CeedOperator_Sycl *impl, const CeedScalar *qf_array,
1252                                                    CeedScalar *values_array) {
1253   // This kernel assumes B_in and B_out have the same number of quadrature points and basis points.
1254   // TODO: expand to more general cases
1255   CeedOperatorAssemble_Sycl *asmb        = impl->asmb;
1256   const CeedInt              nelem       = asmb->nelem;
1257   const CeedInt              nnodes      = asmb->nnodes;
1258   const CeedInt              ncomp       = asmb->ncomp;
1259   const CeedInt              nqpts       = asmb->nqpts;
1260   const CeedInt              numemodein  = asmb->numemodein;
1261   const CeedInt              numemodeout = asmb->numemodeout;
1262 
1263   // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: elememt,
1264   // comp_in, comp_out, node_row, node_col
1265   const CeedInt comp_out_stride = nnodes * nnodes;
1266   const CeedInt comp_in_stride  = comp_out_stride * ncomp;
1267   const CeedInt e_stride        = comp_in_stride * ncomp;
1268   // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt
1269   const CeedInt qe_stride         = nqpts;
1270   const CeedInt qcomp_out_stride  = nelem * qe_stride;
1271   const CeedInt qemode_out_stride = qcomp_out_stride * ncomp;
1272   const CeedInt qcomp_in_stride   = qemode_out_stride * numemodeout;
1273   const CeedInt qemode_in_stride  = qcomp_in_stride * ncomp;
1274 
1275   CeedScalar *B_in, *B_out;
1276   B_in                        = asmb->d_B_in;
1277   B_out                       = asmb->d_B_out;
1278   const CeedInt elemsPerBlock = asmb->elemsPerBlock;
1279   const CeedInt block_size_x  = asmb->block_size_x;
1280   const CeedInt block_size_y  = asmb->block_size_y;  // This will be 1 for the fallback kernel
1281 
1282   const CeedInt     grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0);
1283   sycl::range<3>    local_range(block_size_x, block_size_y, elemsPerBlock);
1284   sycl::range<3>    global_range(grid * block_size_x, block_size_y, elemsPerBlock);
1285   sycl::nd_range<3> kernel_range(global_range, local_range);
1286 
1287   sycl_queue.parallel_for<CeedOperatorSyclLinearAssembleFallback>(kernel_range, [=](sycl::nd_item<3> work_item) {
1288     const CeedInt blockIdx  = work_item.get_group(0);
1289     const CeedInt gridDimx  = work_item.get_group_range(0);
1290     const CeedInt threadIdx = work_item.get_local_id(0);
1291     const CeedInt threadIdz = work_item.get_local_id(2);
1292     const CeedInt blockDimz = work_item.get_local_range(2);
1293 
1294     const int l = threadIdx;  // The output column index of each B^TDB operation
1295                               // such that we have (Bout^T)_ij D_jk Bin_kl = C_il
1296     for (CeedInt e = blockIdx * blockDimz + threadIdz; e < nelem; e += gridDimx * blockDimz) {
1297       for (CeedInt comp_in = 0; comp_in < ncomp; comp_in++) {
1298         for (CeedInt comp_out = 0; comp_out < ncomp; comp_out++) {
1299           for (CeedInt i = 0; i < nnodes; i++) {
1300             CeedScalar result        = 0.0;
1301             CeedInt    qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e;
1302             for (CeedInt emode_in = 0; emode_in < numemodein; emode_in++) {
1303               CeedInt b_in_index = emode_in * nqpts * nnodes;
1304               for (CeedInt emode_out = 0; emode_out < numemodeout; emode_out++) {
1305                 CeedInt b_out_index = emode_out * nqpts * nnodes;
1306                 CeedInt qf_index    = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in;
1307                 // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
1308                 for (CeedInt j = 0; j < nqpts; j++) {
1309                   result += B_out[b_out_index + j * nnodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * nnodes + l];
1310                 }
1311               }  // end of emode_out
1312             }    // end of emode_in
1313             CeedInt val_index       = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l;
1314             values_array[val_index] = result;
1315           }  // end of loop over element node index, i
1316         }    // end of out component
1317       }      // end of in component
1318     }        // end of element loop
1319   });
1320   return CEED_ERROR_SUCCESS;
1321 }*/
1322 
1323 //------------------------------------------------------------------------------
1324 // Assemble matrix data for COO matrix of assembled operator.
1325 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
1326 //
1327 // Note that this (and other assembly routines) currently assume only one active
1328 // input restriction/basis per operator (could have multiple basis eval modes).
1329 // TODO: allow multiple active input restrictions/basis objects
1330 //------------------------------------------------------------------------------
1331 static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, CeedVector values) {
1332   Ceed ceed;
1333   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1334   CeedOperator_Sycl *impl;
1335   CeedCallBackend(CeedOperatorGetData(op, &impl));
1336   Ceed_Sycl *sycl_data;
1337   CeedCallBackend(CeedGetData(ceed, &sycl_data));
1338 
1339   // Setup
1340   if (!impl->asmb) {
1341     CeedCallBackend(CeedSingleOperatorAssembleSetup_Sycl(op));
1342     assert(impl->asmb != NULL);
1343   }
1344 
1345   // Assemble QFunction
1346   CeedVector          assembled_qf = NULL;
1347   CeedElemRestriction rstr_q       = NULL;
1348   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE));
1349   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q));
1350   CeedScalar *values_array;
1351   CeedCallBackend(CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array));
1352   values_array += offset;
1353   const CeedScalar *qf_array;
1354   CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array));
1355 
1356   // Compute B^T D B
1357   CeedCallBackend(CeedOperatorLinearAssemble_Sycl(sycl_data->sycl_queue, impl, qf_array, values_array));
1358 
1359   // Wait for kernels to be completed
1360   // Kris: Review if this is necessary -- enqueing an async barrier may be sufficient
1361   sycl_data->sycl_queue.wait_and_throw();
1362 
1363   // Restore arrays
1364   CeedCallBackend(CeedVectorRestoreArray(values, &values_array));
1365   CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array));
1366 
1367   // Cleanup
1368   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
1369 
1370   return CEED_ERROR_SUCCESS;
1371 }
1372 
1373 //------------------------------------------------------------------------------
1374 // Create operator
1375 //------------------------------------------------------------------------------
1376 int CeedOperatorCreate_Sycl(CeedOperator op) {
1377   Ceed ceed;
1378   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1379   CeedOperator_Sycl *impl;
1380 
1381   CeedCallBackend(CeedCalloc(1, &impl));
1382   CeedCallBackend(CeedOperatorSetData(op, impl));
1383 
1384   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Sycl));
1385   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Sycl));
1386   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Sycl));
1387   CeedCallBackend(
1388       CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl));
1389   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Sycl));
1390   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl));
1391   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl));
1392   return CEED_ERROR_SUCCESS;
1393 }
1394 
1395 //------------------------------------------------------------------------------
1396