xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 25dfc19d2eb771681f2aae4eae966b19dcc9173b)
1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4241a4b83SYohann //
5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7241a4b83SYohann // element discretizations for exascale applications. For more information and
8241a4b83SYohann // source code availability see http://github.com/ceed.
9241a4b83SYohann //
10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
16241a4b83SYohann #include <ceed-backend.h>
17241a4b83SYohann #include "ceed-cuda-gen.h"
18241a4b83SYohann #include <iostream>
19241a4b83SYohann #include <sstream>
20241a4b83SYohann #include "../cuda/ceed-cuda.h"
21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h"
22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
23241a4b83SYohann 
24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
25241a4b83SYohann __device__ double atomicAdd(double *address, double val) {
26241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
27241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
28241a4b83SYohann   do {
29241a4b83SYohann     assumed = old;
30241a4b83SYohann     old =
31241a4b83SYohann       atomicCAS(address_as_ull, assumed,
32241a4b83SYohann                 __double_as_longlong(val +
33241a4b83SYohann                                      __longlong_as_double(assumed)));
34241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
35241a4b83SYohann     // (since NaN != NaN)
36241a4b83SYohann   } while (assumed != old);
37241a4b83SYohann   return __longlong_as_double(old);
38241a4b83SYohann }
39f1a13f77SYohann Dudouit );
40f1a13f77SYohann Dudouit 
41f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
42f1a13f77SYohann Dudouit 
43f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
44f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
45f1a13f77SYohann Dudouit 
46f1a13f77SYohann Dudouit typedef struct {
47f1a13f77SYohann Dudouit   CeedInt tidx;
48f1a13f77SYohann Dudouit   CeedInt tidy;
49f1a13f77SYohann Dudouit   CeedInt tidz;
50f1a13f77SYohann Dudouit   CeedInt tid;
51f1a13f77SYohann Dudouit   CeedScalar* slice;
52f1a13f77SYohann Dudouit } BackendData;
53241a4b83SYohann 
54241a4b83SYohann template <int P, int Q>
55241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
56ac421f39SYohann   for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
57241a4b83SYohann     B[i] = d_B[i];
58241a4b83SYohann   }
59241a4b83SYohann }
60241a4b83SYohann 
61241a4b83SYohann //****
62241a4b83SYohann // 1D
63241a4b83SYohann template <int NCOMP, int P1d>
64920dcdc4Sjeremylt inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
65241a4b83SYohann   if (data.tidx<P1d)
66241a4b83SYohann   {
674d537eeaSYohann     const CeedInt node = data.tidx;
68ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
69241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
708795c945Sjeremylt       r_u[comp] = d_u[ind + nnodes * comp];
71241a4b83SYohann     }
72241a4b83SYohann   }
73241a4b83SYohann }
74241a4b83SYohann 
75241a4b83SYohann template <int NCOMP, int P1d>
76920dcdc4Sjeremylt inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
77241a4b83SYohann   if (data.tidx<P1d)
78241a4b83SYohann   {
794d537eeaSYohann     const CeedInt node = data.tidx;
80ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
81241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
82241a4b83SYohann       r_u[comp] = d_u[ind * NCOMP + comp];
83241a4b83SYohann     }
84241a4b83SYohann   }
85241a4b83SYohann }
86920dcdc4Sjeremylt 
87d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
88d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
89ccedf6b0Sjeremylt   if (data.tidx<P1d)
90ccedf6b0Sjeremylt   {
91ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
92d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
93ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
94d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
95ccedf6b0Sjeremylt     }
96ccedf6b0Sjeremylt   }
97ccedf6b0Sjeremylt }
98241a4b83SYohann 
99241a4b83SYohann template <int NCOMP, int P1d>
100920dcdc4Sjeremylt inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
101241a4b83SYohann   if (data.tidx<P1d)
102241a4b83SYohann   {
1034d537eeaSYohann     const CeedInt node = data.tidx;
104ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
105241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
1068795c945Sjeremylt       atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
107241a4b83SYohann     }
108241a4b83SYohann   }
109241a4b83SYohann }
110241a4b83SYohann 
111241a4b83SYohann template <int NCOMP, int P1d>
112920dcdc4Sjeremylt inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
113241a4b83SYohann   if (data.tidx<P1d)
114241a4b83SYohann   {
1154d537eeaSYohann     const CeedInt node = data.tidx;
116ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
117241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
118241a4b83SYohann       atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
119241a4b83SYohann     }
120241a4b83SYohann   }
121241a4b83SYohann }
122241a4b83SYohann 
123d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
124d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
125ccedf6b0Sjeremylt   if (data.tidx<P1d)
126ccedf6b0Sjeremylt   {
127ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
128d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
129ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
130d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
131ccedf6b0Sjeremylt     }
132ccedf6b0Sjeremylt   }
133ccedf6b0Sjeremylt }
134ccedf6b0Sjeremylt 
135241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
136241a4b83SYohann inline __device__ void ContractX1d(BackendData& data,
137241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
138241a4b83SYohann   data.slice[data.tidx] = *U;
139241a4b83SYohann   __syncthreads();
140241a4b83SYohann   *V = 0.0;
141ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
142241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
143241a4b83SYohann   }
144241a4b83SYohann   __syncthreads();
145241a4b83SYohann }
146241a4b83SYohann 
147241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
148241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data,
149241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
150241a4b83SYohann   data.slice[data.tidx] = *U;
151241a4b83SYohann   __syncthreads();
152241a4b83SYohann   *V = 0.0;
153ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
154241a4b83SYohann     *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
155241a4b83SYohann   }
156241a4b83SYohann   __syncthreads();
157241a4b83SYohann }
158241a4b83SYohann 
159241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
160241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
161241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
162ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
163241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
164241a4b83SYohann   }
165241a4b83SYohann }
166241a4b83SYohann 
167241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
168241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
169241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
170ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
171241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
172241a4b83SYohann   }
173241a4b83SYohann }
174241a4b83SYohann 
175241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
176241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
177241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
178241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
179ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
180241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
181241a4b83SYohann   }
182241a4b83SYohann }
183241a4b83SYohann 
184241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
185241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
186241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
187241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
188ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
189241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
190241a4b83SYohann   }
191241a4b83SYohann }
192241a4b83SYohann 
193241a4b83SYohann //****
194241a4b83SYohann // 2D
195241a4b83SYohann template <int NCOMP, int P1d>
196920dcdc4Sjeremylt inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
197241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
198241a4b83SYohann   {
1994d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
200ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
201241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
2028795c945Sjeremylt       r_u[comp] = d_u[ind + nnodes * comp];
203241a4b83SYohann     }
204241a4b83SYohann   }
205241a4b83SYohann }
206241a4b83SYohann 
207241a4b83SYohann template <int NCOMP, int P1d>
208920dcdc4Sjeremylt inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
209241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
210241a4b83SYohann   {
2114d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
212ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
213241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
214241a4b83SYohann       r_u[comp] = d_u[ind * NCOMP + comp];
215241a4b83SYohann     }
216241a4b83SYohann   }
217241a4b83SYohann }
218920dcdc4Sjeremylt 
219d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
220d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
221ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d)
222ccedf6b0Sjeremylt   {
223ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
224d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
225ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
226d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
227ccedf6b0Sjeremylt     }
228ccedf6b0Sjeremylt   }
229ccedf6b0Sjeremylt }
230241a4b83SYohann 
231241a4b83SYohann template <int NCOMP, int P1d>
232920dcdc4Sjeremylt inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
233241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
234241a4b83SYohann   {
2354d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
236ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
237241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
2388795c945Sjeremylt       atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
239241a4b83SYohann     }
240241a4b83SYohann   }
241241a4b83SYohann }
242241a4b83SYohann 
243241a4b83SYohann template <int NCOMP, int P1d>
244920dcdc4Sjeremylt inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
245241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
246241a4b83SYohann   {
2474d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
248ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
249241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
250241a4b83SYohann       atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
251241a4b83SYohann     }
252241a4b83SYohann   }
253241a4b83SYohann }
254241a4b83SYohann 
255d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
256d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
257ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d)
258ccedf6b0Sjeremylt   {
259ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
260d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
261ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
262d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
263ccedf6b0Sjeremylt     }
264ccedf6b0Sjeremylt   }
265ccedf6b0Sjeremylt }
266ccedf6b0Sjeremylt 
267241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
268241a4b83SYohann inline __device__ void ContractX2d(BackendData& data,
269241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
270241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
271241a4b83SYohann   __syncthreads();
272241a4b83SYohann   *V = 0.0;
273ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
274241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
275241a4b83SYohann   }
276241a4b83SYohann   __syncthreads();
277241a4b83SYohann }
278241a4b83SYohann 
279241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
280241a4b83SYohann inline __device__ void ContractY2d(BackendData& data,
281241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
282241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
283241a4b83SYohann   __syncthreads();
284241a4b83SYohann   *V = 0.0;
285ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
286241a4b83SYohann     *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
287241a4b83SYohann   }
288241a4b83SYohann   __syncthreads();
289241a4b83SYohann }
290241a4b83SYohann 
291241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
292241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data,
293241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
294241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
295241a4b83SYohann   __syncthreads();
296241a4b83SYohann   *V = 0.0;
297241a4b83SYohann   if (data.tidy<P1d) {
298ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
299241a4b83SYohann       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
300241a4b83SYohann     }
301241a4b83SYohann   }
302241a4b83SYohann   __syncthreads();
303241a4b83SYohann }
304241a4b83SYohann 
305241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
306241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data,
307241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
308241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
309241a4b83SYohann   __syncthreads();
310241a4b83SYohann   *V = 0.0;
311241a4b83SYohann   if (data.tidx<P1d) {
312ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
313241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
314241a4b83SYohann     }
315241a4b83SYohann   }
316241a4b83SYohann   __syncthreads();
317241a4b83SYohann }
318241a4b83SYohann 
319241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
320241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data,
321241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
322241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
323241a4b83SYohann   __syncthreads();
324241a4b83SYohann   if (data.tidx<P1d) {
325ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
326241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
327241a4b83SYohann     }
328241a4b83SYohann   }
329241a4b83SYohann   __syncthreads();
330241a4b83SYohann }
331241a4b83SYohann 
332241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
333241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
334241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
335241a4b83SYohann   CeedScalar r_t[1];
336ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
337241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
338241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
339241a4b83SYohann   }
340241a4b83SYohann }
341241a4b83SYohann 
342241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
343241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
344241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
345241a4b83SYohann   CeedScalar r_t[1];
346ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
347241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
348241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
349241a4b83SYohann   }
350241a4b83SYohann }
351241a4b83SYohann 
352241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
353241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
354241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
355241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
356241a4b83SYohann   CeedScalar r_t[1];
357ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
358241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
359241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
360241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
361241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
362241a4b83SYohann   }
363241a4b83SYohann }
364241a4b83SYohann 
365241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
366241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
367241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
368241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
369241a4b83SYohann   CeedScalar r_t[1];
370ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
371241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
372241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
373241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
374241a4b83SYohann     ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
375241a4b83SYohann   }
376241a4b83SYohann }
377241a4b83SYohann 
378241a4b83SYohann //****
379241a4b83SYohann // 3D
380241a4b83SYohann template <int NCOMP, int P1d>
381920dcdc4Sjeremylt inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
382241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
383241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
3844d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
385ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
386241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
3878795c945Sjeremylt         r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
388241a4b83SYohann       }
389241a4b83SYohann     }
390241a4b83SYohann   }
391241a4b83SYohann }
392241a4b83SYohann 
393241a4b83SYohann template <int NCOMP, int P1d>
394920dcdc4Sjeremylt inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
395241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
396241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
3974d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
398ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
399241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
400241a4b83SYohann         r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
401241a4b83SYohann       }
402241a4b83SYohann     }
403241a4b83SYohann   }
404241a4b83SYohann }
405920dcdc4Sjeremylt 
406d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
407d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
408ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d) {
409ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
410ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
411d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
412ccedf6b0Sjeremylt       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
413d80fc06aSjeremylt         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
414ccedf6b0Sjeremylt       }
415ccedf6b0Sjeremylt     }
416ccedf6b0Sjeremylt   }
417ccedf6b0Sjeremylt }
418241a4b83SYohann 
419241a4b83SYohann template <int NCOMP, int Q1d>
420920dcdc4Sjeremylt inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
421ac421f39SYohann   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
422920dcdc4Sjeremylt   const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
423ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
424ac421f39SYohann     r_u[comp] = d_u[ind + nquads * comp];
425ac421f39SYohann   }
426ac421f39SYohann }
427ac421f39SYohann 
428ac421f39SYohann template <int NCOMP, int Q1d>
429920dcdc4Sjeremylt inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
430ac421f39SYohann   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
431920dcdc4Sjeremylt   const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
432ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
433ac421f39SYohann     r_u[comp] = d_u[ind * NCOMP + comp];
434ac421f39SYohann   }
435ac421f39SYohann }
436ac421f39SYohann 
437d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
438*25dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
439920dcdc4Sjeremylt   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
440d80fc06aSjeremylt   const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
441920dcdc4Sjeremylt   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
442d80fc06aSjeremylt     r_u[comp] = d_u[ind + comp * STRIDES_COMP];
443920dcdc4Sjeremylt   }
444920dcdc4Sjeremylt }
445920dcdc4Sjeremylt 
446241a4b83SYohann template <int NCOMP, int P1d>
447920dcdc4Sjeremylt inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
448241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
449241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4504d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
451ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
452241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
4538795c945Sjeremylt         atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
454241a4b83SYohann       }
455241a4b83SYohann     }
456241a4b83SYohann   }
457241a4b83SYohann }
458241a4b83SYohann 
459241a4b83SYohann template <int NCOMP, int P1d>
460920dcdc4Sjeremylt inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
461241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
462241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4634d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
464ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
465241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
466241a4b83SYohann         atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
467241a4b83SYohann       }
468241a4b83SYohann     }
469241a4b83SYohann   }
470241a4b83SYohann }
471241a4b83SYohann 
472d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4732f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
474ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d) {
475ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
476ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
477d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
478ccedf6b0Sjeremylt       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
479d80fc06aSjeremylt         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
480ccedf6b0Sjeremylt       }
481ccedf6b0Sjeremylt     }
482ccedf6b0Sjeremylt   }
483ccedf6b0Sjeremylt }
484ccedf6b0Sjeremylt 
485241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
486241a4b83SYohann inline __device__ void ContractX3d(BackendData& data,
487241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
488ac421f39SYohann   CeedScalar r_B[P1d];
489ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
490ac421f39SYohann     r_B[i] = B[i + data.tidx*P1d];
491ac421f39SYohann   }
492ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
493241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
494241a4b83SYohann     __syncthreads();
495241a4b83SYohann     V[k] = 0.0;
496ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
497ac421f39SYohann       V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
498241a4b83SYohann     }
499241a4b83SYohann     __syncthreads();
500241a4b83SYohann   }
501241a4b83SYohann }
502241a4b83SYohann 
503241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
504241a4b83SYohann inline __device__ void ContractY3d(BackendData& data,
505241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
506ac421f39SYohann   CeedScalar r_B[P1d];
507ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
508ac421f39SYohann     r_B[i] = B[i + data.tidy*P1d];
509ac421f39SYohann   }
510ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
511241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
512241a4b83SYohann     __syncthreads();
513241a4b83SYohann     V[k] = 0.0;
514ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
515ac421f39SYohann       V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
516241a4b83SYohann     }
517241a4b83SYohann     __syncthreads();
518241a4b83SYohann   }
519241a4b83SYohann }
520241a4b83SYohann 
521241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
522241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data,
523241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
524ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
525241a4b83SYohann     V[k] = 0.0;
526ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
527241a4b83SYohann       V[k] += B[i + k*P1d] * U[i];//contract z direction
528241a4b83SYohann     }
529241a4b83SYohann   }
530241a4b83SYohann }
531241a4b83SYohann 
532241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
533241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data,
534241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
535ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
536241a4b83SYohann     V[k] = 0.0;
537241a4b83SYohann     if (k<P1d) {
538ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
539241a4b83SYohann         V[k] += B[k + i*P1d] * U[i];//contract z direction
540241a4b83SYohann       }
541241a4b83SYohann     }
542241a4b83SYohann   }
543241a4b83SYohann }
544241a4b83SYohann 
545241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
546241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data,
547241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
548ac421f39SYohann   CeedScalar r_B[Q1d];
549ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i)
550ac421f39SYohann   {
551ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
552ac421f39SYohann   }
553ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
554241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
555241a4b83SYohann     __syncthreads();
556241a4b83SYohann     V[k] = 0.0;
557241a4b83SYohann     if (data.tidy<P1d) {
558ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
559ac421f39SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
560ac421f39SYohann       }
561ac421f39SYohann     }
562ac421f39SYohann     __syncthreads();
563ac421f39SYohann   }
564ac421f39SYohann }
565ac421f39SYohann 
566ac421f39SYohann template <int NCOMP, int P1d, int Q1d>
567ac421f39SYohann inline __device__ void ContractTransposeAddY3d(BackendData& data,
568ac421f39SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
569ac421f39SYohann   CeedScalar r_B[Q1d];
570ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
571ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
572ac421f39SYohann   }
573ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
574ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
575ac421f39SYohann     __syncthreads();
576ac421f39SYohann     if (data.tidy<P1d) {
577ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
578ac421f39SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
579241a4b83SYohann       }
580241a4b83SYohann     }
581241a4b83SYohann     __syncthreads();
582241a4b83SYohann   }
583241a4b83SYohann }
584241a4b83SYohann 
585241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
586241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data,
587241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
588ac421f39SYohann   CeedScalar r_B[Q1d];
589ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
590ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
591ac421f39SYohann   }
592ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
593241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
594241a4b83SYohann     __syncthreads();
595241a4b83SYohann     V[k] = 0.0;
596241a4b83SYohann     if (data.tidx<P1d) {
597ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
598ac421f39SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
599241a4b83SYohann       }
600241a4b83SYohann     }
601241a4b83SYohann     __syncthreads();
602241a4b83SYohann   }
603241a4b83SYohann }
604241a4b83SYohann 
605241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
606241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data,
607241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
608ac421f39SYohann   CeedScalar r_B[Q1d];
609ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
610ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
611ac421f39SYohann   }
612ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
613241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
614241a4b83SYohann     __syncthreads();
615241a4b83SYohann     if (data.tidx<P1d) {
616ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
617ac421f39SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
618241a4b83SYohann       }
619241a4b83SYohann     }
620241a4b83SYohann     __syncthreads();
621241a4b83SYohann   }
622241a4b83SYohann }
623241a4b83SYohann 
624241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
625241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
626241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
627241a4b83SYohann   CeedScalar r_t1[Q1d];
628241a4b83SYohann   CeedScalar r_t2[Q1d];
629ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
630241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
631241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
632241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
633241a4b83SYohann   }
634241a4b83SYohann }
635241a4b83SYohann 
636241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
637241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
638241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
639241a4b83SYohann   CeedScalar r_t1[Q1d];
640241a4b83SYohann   CeedScalar r_t2[Q1d];
641ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
642241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
643241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
644241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
645241a4b83SYohann   }
646241a4b83SYohann }
647241a4b83SYohann 
648241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
649241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
650241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
651241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
652241a4b83SYohann   CeedScalar r_t1[Q1d];
653241a4b83SYohann   CeedScalar r_t2[Q1d];
654ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
655241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
656241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
657241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
658241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
659241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
660241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
661241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
662241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
663241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
664241a4b83SYohann   }
665241a4b83SYohann }
666241a4b83SYohann 
667241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
668241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
669241a4b83SYohann                                        const CeedScalar *c_B, const CeedScalar *c_G,
670241a4b83SYohann                                        CeedScalar *__restrict__ r_V) {
671241a4b83SYohann   CeedScalar r_t1[Q1d];
672241a4b83SYohann   CeedScalar r_t2[Q1d];
673ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
674241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
675241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
676241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
677241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
678241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
679241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
680241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
681241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
682241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
683241a4b83SYohann   }
684241a4b83SYohann }
685241a4b83SYohann 
686ac421f39SYohann template <int NCOMP, int Q1d>
687ac421f39SYohann inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
688ac421f39SYohann                                    const CeedScalar *__restrict__ r_U,
689ac421f39SYohann                                    const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
690ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
691ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
692ac421f39SYohann     __syncthreads();
693ac421f39SYohann     // X derivative
694ac421f39SYohann     r_V[comp+0*NCOMP] = 0.0;
695ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
696ac421f39SYohann       r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
697ac421f39SYohann     }
698ac421f39SYohann     // Y derivative
699ac421f39SYohann     r_V[comp+1*NCOMP] = 0.0;
700ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
701ac421f39SYohann       r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
702ac421f39SYohann     }
703ac421f39SYohann     // Z derivative
704ac421f39SYohann     r_V[comp+2*NCOMP] = 0.0;
705ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
706ac421f39SYohann       r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
707ac421f39SYohann     }
708ac421f39SYohann     __syncthreads();
709ac421f39SYohann   }
710ac421f39SYohann }
711ac421f39SYohann 
712ac421f39SYohann template <int NCOMP, int Q1d>
713ac421f39SYohann inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
714ac421f39SYohann                                             const CeedScalar *__restrict__ r_U,
715ac421f39SYohann                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
716ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
717ac421f39SYohann     // X derivative
718ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
719ac421f39SYohann     __syncthreads();
720ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
721ac421f39SYohann       r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
722ac421f39SYohann     }
723ac421f39SYohann     __syncthreads();
724ac421f39SYohann     // Y derivative
725ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
726ac421f39SYohann     __syncthreads();
727ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
728ac421f39SYohann       r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
729ac421f39SYohann     }
730ac421f39SYohann     __syncthreads();
731ac421f39SYohann     // Z derivative
732ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
733ac421f39SYohann       r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
734ac421f39SYohann     }
735ac421f39SYohann   }
736ac421f39SYohann }
737ac421f39SYohann 
738241a4b83SYohann template <int Q1d>
739241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
740241a4b83SYohann   *w = qweight1d[data.tidx];
741241a4b83SYohann }
742241a4b83SYohann 
743241a4b83SYohann template <int Q1d>
744241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
745241a4b83SYohann   *w = qweight1d[data.tidx]*qweight1d[data.tidy];
746241a4b83SYohann }
747241a4b83SYohann 
748241a4b83SYohann template <int Q1d>
749241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
750241a4b83SYohann   const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
751ac421f39SYohann   for (CeedInt z = 0; z < Q1d; ++z)
752241a4b83SYohann   {
753241a4b83SYohann     w[z] = pw*qweight1d[z];
754241a4b83SYohann   }
755241a4b83SYohann }
756241a4b83SYohann 
757241a4b83SYohann );
758241a4b83SYohann 
759241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
760241a4b83SYohann 
761241a4b83SYohann   using std::ostringstream;
762241a4b83SYohann   using std::string;
763241a4b83SYohann   int ierr;
764241a4b83SYohann   bool setupdone;
765241a4b83SYohann   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
766241a4b83SYohann   if (setupdone) return 0;
767241a4b83SYohann   Ceed ceed;
768241a4b83SYohann   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
769241a4b83SYohann   CeedOperator_Cuda_gen *data;
770241a4b83SYohann   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
771241a4b83SYohann   CeedQFunction qf;
772241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
773241a4b83SYohann   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
774241a4b83SYohann   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
7758795c945Sjeremylt   CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes;
776241a4b83SYohann   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
777241a4b83SYohann   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
778241a4b83SYohann   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
779241a4b83SYohann   CeedChk(ierr);
780241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
781241a4b83SYohann   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
782241a4b83SYohann   CeedChk(ierr);
783241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
784241a4b83SYohann   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
785241a4b83SYohann   CeedChk(ierr);
786241a4b83SYohann   CeedEvalMode emode;
78761dbc9d2Sjeremylt   CeedInterlaceMode imode;
788241a4b83SYohann   CeedBasis basis;
789241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
790241a4b83SYohann   CeedElemRestriction Erestrict;
791241a4b83SYohann   CeedElemRestriction_Cuda_reg *restr_data;
792241a4b83SYohann 
793241a4b83SYohann   ostringstream code;
794241a4b83SYohann   string devFunctions(deviceFunctions);
795241a4b83SYohann 
796f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
797f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
798f1a13f77SYohann Dudouit   Ceed delegate;
799f1a13f77SYohann Dudouit   CeedGetDelegate(ceed, &delegate);
800f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
801f1a13f77SYohann Dudouit   ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr);
802f1a13f77SYohann Dudouit   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
803f1a13f77SYohann Dudouit   if (prop.major<6){
804f1a13f77SYohann Dudouit     code << atomicAdd;
805f1a13f77SYohann Dudouit   }
806f1a13f77SYohann Dudouit 
807241a4b83SYohann   code << devFunctions;
808241a4b83SYohann 
809241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
8104d537eeaSYohann 
8114d537eeaSYohann   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
812ee07ded2SValeria Barra   code << "\n#define CeedPragmaSIMD\n";
813241a4b83SYohann   code << qFunction;
814241a4b83SYohann 
815241a4b83SYohann   // Setup
816d80fc06aSjeremylt   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
817241a4b83SYohann   // Input Evecs and Restriction
818241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
819241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
820241a4b83SYohann     CeedChk(ierr);
821241a4b83SYohann     if (emode == CEED_EVAL_WEIGHT) { // Skip
822241a4b83SYohann     } else {
823241a4b83SYohann       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
824241a4b83SYohann       if (emode != CEED_EVAL_NONE)
825241a4b83SYohann       {
826241a4b83SYohann         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
827241a4b83SYohann         bool isTensor;
828241a4b83SYohann         ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
829241a4b83SYohann         //TODO check that all are the same
830241a4b83SYohann         ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
831241a4b83SYohann         if (isTensor)
832241a4b83SYohann         {
833241a4b83SYohann           //TODO check that all are the same
834241a4b83SYohann           ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
835241a4b83SYohann         } else {
836241a4b83SYohann           return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
837241a4b83SYohann         }
838241a4b83SYohann       }
839241a4b83SYohann     }
840241a4b83SYohann   }
841241a4b83SYohann   data->dim = dim;
842241a4b83SYohann   data->Q1d = Q1d;
843241a4b83SYohann 
844241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
845241a4b83SYohann     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
846241a4b83SYohann   }
847241a4b83SYohann   code << "const CeedInt Dim = "<<dim<<";\n";
848241a4b83SYohann   code << "const CeedInt Q1d = "<<Q1d<<";\n";
849241a4b83SYohann   // code << "const CeedInt Q   = "<<Q<<";\n";
850241a4b83SYohann   code << "extern __shared__ CeedScalar slice[];\n";
851241a4b83SYohann   code << "BackendData data;\n";
852241a4b83SYohann   code << "data.tidx = threadIdx.x;\n";
853241a4b83SYohann   code << "data.tidy = threadIdx.y;\n";
854241a4b83SYohann   code << "data.tidz = threadIdx.z;\n";
855241a4b83SYohann   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
856241a4b83SYohann   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
857920dcdc4Sjeremylt 
858920dcdc4Sjeremylt   code << "\n// Input field constants and basis data\n";
859ac421f39SYohann   //Initialize constants, and matrices B and G
860241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
861920dcdc4Sjeremylt     code << "// -- Input field "<<i<<" --\n";
862241a4b83SYohann     // Get elemsize, emode, ncomp
863241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
864241a4b83SYohann     CeedChk(ierr);
865241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
866241a4b83SYohann     CeedChk(ierr);
867241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
868241a4b83SYohann     CeedChk(ierr);
8694d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
870241a4b83SYohann     CeedChk(ierr);
871920dcdc4Sjeremylt 
872920dcdc4Sjeremylt     // Set field constants
873920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT) {
874920dcdc4Sjeremylt       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
875920dcdc4Sjeremylt       if (basis != CEED_BASIS_COLLOCATED) {
876920dcdc4Sjeremylt         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
877920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
878920dcdc4Sjeremylt       } else {
879920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
880920dcdc4Sjeremylt       }
881920dcdc4Sjeremylt       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
882920dcdc4Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
883920dcdc4Sjeremylt       code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
884920dcdc4Sjeremylt     }
885920dcdc4Sjeremylt 
886920dcdc4Sjeremylt     // Load basis data
887920dcdc4Sjeremylt     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
888241a4b83SYohann     switch (emode) {
889241a4b83SYohann     case CEED_EVAL_NONE:
890241a4b83SYohann       break;
891241a4b83SYohann     case CEED_EVAL_INTERP:
892241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
893241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
894241a4b83SYohann       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
895241a4b83SYohann       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
896241a4b83SYohann       break;
897241a4b83SYohann     case CEED_EVAL_GRAD:
898241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
899241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
900241a4b83SYohann       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
901241a4b83SYohann       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
902ac421f39SYohann       if (basis_data->d_collograd1d) {
903ac421f39SYohann         data->G.in[i] = basis_data->d_collograd1d;
904ac421f39SYohann         code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
905ac421f39SYohann         code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
906ac421f39SYohann       } else {
907ac421f39SYohann         data->G.in[i] = basis_data->d_grad1d;
908ac421f39SYohann         code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
909241a4b83SYohann         code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
910ac421f39SYohann       }
911241a4b83SYohann       break;
912241a4b83SYohann     case CEED_EVAL_WEIGHT:
913241a4b83SYohann       break; // No action
914241a4b83SYohann     case CEED_EVAL_DIV:
915241a4b83SYohann       break; // TODO: Not implemented
916241a4b83SYohann     case CEED_EVAL_CURL:
917241a4b83SYohann       break; // TODO: Not implemented
918241a4b83SYohann     }
919241a4b83SYohann   }
920920dcdc4Sjeremylt 
921920dcdc4Sjeremylt   code << "\n// Output field constants and basis data\n";
922241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
923920dcdc4Sjeremylt     code << "// -- Output field "<<i<<" --\n";
924241a4b83SYohann     // Get elemsize, emode, ncomp
925241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
926241a4b83SYohann     CeedChk(ierr);
927241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
928241a4b83SYohann     CeedChk(ierr);
929241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
930241a4b83SYohann     CeedChk(ierr);
9314d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
932241a4b83SYohann     CeedChk(ierr);
933920dcdc4Sjeremylt 
934920dcdc4Sjeremylt     // Set field constants
935241a4b83SYohann     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
936920dcdc4Sjeremylt     if (basis != CEED_BASIS_COLLOCATED) {
937241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
938241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
939920dcdc4Sjeremylt     } else {
940920dcdc4Sjeremylt       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
941920dcdc4Sjeremylt     }
942920dcdc4Sjeremylt     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
943920dcdc4Sjeremylt     ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
944920dcdc4Sjeremylt     code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
945920dcdc4Sjeremylt 
946920dcdc4Sjeremylt     // Load basis data
947920dcdc4Sjeremylt     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
948920dcdc4Sjeremylt     switch (emode) {
949920dcdc4Sjeremylt     case CEED_EVAL_NONE:
950920dcdc4Sjeremylt       break; // No action
951920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
952241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
953241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
954241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
955241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
956241a4b83SYohann       break;
957241a4b83SYohann     case CEED_EVAL_GRAD:
958241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
959241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
960241a4b83SYohann       code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
961241a4b83SYohann       code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
962ac421f39SYohann       if (basis_data->d_collograd1d) {
963ac421f39SYohann         data->G.out[i] = basis_data->d_collograd1d;
964ac421f39SYohann         code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
965ac421f39SYohann         code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
966ac421f39SYohann       } else {
967ac421f39SYohann         data->G.out[i] = basis_data->d_grad1d;
968ac421f39SYohann         code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
969241a4b83SYohann         code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
970ac421f39SYohann       }
971241a4b83SYohann       break;
972241a4b83SYohann     case CEED_EVAL_WEIGHT: {
973241a4b83SYohann       Ceed ceed;
974241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
975241a4b83SYohann       return CeedError(ceed, 1,
976241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
977241a4b83SYohann       break; // Should not occur
978241a4b83SYohann     }
979241a4b83SYohann     case CEED_EVAL_DIV:
980241a4b83SYohann       break; // TODO: Not implemented
981241a4b83SYohann     case CEED_EVAL_CURL:
982241a4b83SYohann       break; // TODO: Not implemented
983241a4b83SYohann     }
984241a4b83SYohann   }
985ac421f39SYohann   code << "\n";
986ac421f39SYohann   code << "__syncthreads();\n";
987241a4b83SYohann   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
988241a4b83SYohann   // Input basis apply if needed
989ac421f39SYohann   // Generate the correct eval mode code for each input
990920dcdc4Sjeremylt   code << "\n// Input field restrictions and basis actions\n";
991241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
992920dcdc4Sjeremylt     code << "  // -- Input field "<<i<<" --\n";
993241a4b83SYohann     // Get elemsize, emode, ncomp
994241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
995241a4b83SYohann     CeedChk(ierr);
996241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
997241a4b83SYohann     CeedChk(ierr);
998241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
999241a4b83SYohann     CeedChk(ierr);
10004d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1001241a4b83SYohann     CeedChk(ierr);
1002920dcdc4Sjeremylt 
1003920dcdc4Sjeremylt     // Restriction
1004920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT &&
1005920dcdc4Sjeremylt         !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) {
1006241a4b83SYohann       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1007241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1008241a4b83SYohann       data->indices.in[i] = restr_data->d_ind;
1009f2b2a896Sjeremylt       if (data->indices.in[i]) {
1010ccedf6b0Sjeremylt         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1011920dcdc4Sjeremylt       code << "  // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n";
1012920dcdc4Sjeremylt       code << "  readDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1013ccedf6b0Sjeremylt       } else {
1014920dcdc4Sjeremylt         bool backendstrides;
1015920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict,
1016920dcdc4Sjeremylt                                                           &backendstrides);
1017920dcdc4Sjeremylt         CeedChk(ierr);
1018920dcdc4Sjeremylt         CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1019920dcdc4Sjeremylt         if (!backendstrides) {
1020920dcdc4Sjeremylt           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1021920dcdc4Sjeremylt           CeedChk(ierr);
1022ccedf6b0Sjeremylt         }
1023920dcdc4Sjeremylt         code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1024d80fc06aSjeremylt         code << "  readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1025920dcdc4Sjeremylt       }
1026920dcdc4Sjeremylt     }
1027920dcdc4Sjeremylt 
1028920dcdc4Sjeremylt     // Basis action
1029920dcdc4Sjeremylt     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
1030920dcdc4Sjeremylt     switch (emode) {
1031920dcdc4Sjeremylt     case CEED_EVAL_NONE:
1032920dcdc4Sjeremylt       if (!basis_data->d_collograd1d) {
1033920dcdc4Sjeremylt         code << "  CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1034920dcdc4Sjeremylt       }
1035920dcdc4Sjeremylt       break;
1036920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
1037241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1038241a4b83SYohann       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1039241a4b83SYohann       break;
1040241a4b83SYohann     case CEED_EVAL_GRAD:
1041ac421f39SYohann       if (basis_data->d_collograd1d) {
1042ac421f39SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1043ac421f39SYohann         code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1044ac421f39SYohann       } else {
1045241a4b83SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1046241a4b83SYohann         code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1047ac421f39SYohann       }
1048241a4b83SYohann       break;
1049241a4b83SYohann     case CEED_EVAL_WEIGHT:
1050241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
1051241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1052241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1053241a4b83SYohann       data->W = basis_data->d_qweight1d;
1054241a4b83SYohann       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1055241a4b83SYohann       break; // No action
1056241a4b83SYohann     case CEED_EVAL_DIV:
1057241a4b83SYohann       break; // TODO: Not implemented
1058241a4b83SYohann     case CEED_EVAL_CURL:
1059241a4b83SYohann       break; // TODO: Not implemented
1060241a4b83SYohann     }
1061241a4b83SYohann   }
1062ac421f39SYohann 
1063241a4b83SYohann   // Q function
1064920dcdc4Sjeremylt   code << "\n// Output field setup\n";
1065241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1066920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1067241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1068241a4b83SYohann     CeedChk(ierr);
1069241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
1070241a4b83SYohann     {
1071ac421f39SYohann       if (basis_data->d_collograd1d) {
1072ac421f39SYohann         //Accumulator for gradient slices
1073ac421f39SYohann         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1074ac421f39SYohann         code << "  for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1075ac421f39SYohann         code << "    for (CeedInt j = 0; j < Q1d; ++j) {\n";
1076ac421f39SYohann         code << "      r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1077ac421f39SYohann         code << "    }\n";
1078ac421f39SYohann         code << "  }\n";
1079ac421f39SYohann       } else {
1080241a4b83SYohann         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1081241a4b83SYohann       }
1082ac421f39SYohann     }
1083241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1084241a4b83SYohann     {
1085241a4b83SYohann       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1086241a4b83SYohann     }
1087241a4b83SYohann   }
1088ac421f39SYohann   //We treat quadrature points per slice in 3d to save registers
1089ac421f39SYohann   if (basis_data->d_collograd1d) {
1090920dcdc4Sjeremylt     code << "\n  // Note: Collocated Gradient\n";
1091ac421f39SYohann     code << "#pragma unroll\n";
1092ac421f39SYohann     code << "for (CeedInt q=0; q<Q1d; q++) {\n";
1093ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1094920dcdc4Sjeremylt       code << "  // -- Input field "<<i<<" --\n";
1095ac421f39SYohann       // Get elemsize, emode, ncomp
1096ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1097ac421f39SYohann       CeedChk(ierr);
1098ac421f39SYohann       // Basis action
1099920dcdc4Sjeremylt       code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
1100ac421f39SYohann       switch (emode) {
1101ac421f39SYohann       case CEED_EVAL_NONE:
1102ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1103920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1104920dcdc4Sjeremylt         data->indices.in[i] = restr_data->d_ind;
1105920dcdc4Sjeremylt         if (data->indices.in[i]) {
110661dbc9d2Sjeremylt           ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1107920dcdc4Sjeremylt           code << "  // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n";
1108*25dfc19dSjeremylt           code << "  readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1109920dcdc4Sjeremylt         } else {
1110920dcdc4Sjeremylt           bool backendstrides;
1111920dcdc4Sjeremylt           ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict,
1112920dcdc4Sjeremylt                                                             &backendstrides);
1113920dcdc4Sjeremylt           CeedChk(ierr);
1114920dcdc4Sjeremylt           CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1115920dcdc4Sjeremylt           if (!backendstrides) {
1116920dcdc4Sjeremylt             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1117920dcdc4Sjeremylt             CeedChk(ierr);
1118920dcdc4Sjeremylt           }
1119920dcdc4Sjeremylt           code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1120d80fc06aSjeremylt           code << "  readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1121920dcdc4Sjeremylt         }
1122ac421f39SYohann         break;
1123ac421f39SYohann       case CEED_EVAL_INTERP:
1124ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1125ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1126ac421f39SYohann         code << "    r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1127ac421f39SYohann         code << "  }\n";
1128ac421f39SYohann         break;
1129ac421f39SYohann       case CEED_EVAL_GRAD:
1130ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1131ac421f39SYohann         code << "  gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1132ac421f39SYohann         break;
1133ac421f39SYohann       case CEED_EVAL_WEIGHT:
1134ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[1];\n";
1135ac421f39SYohann         code << "  r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1136ac421f39SYohann         break; // No action
1137ac421f39SYohann       case CEED_EVAL_DIV:
1138ac421f39SYohann         break; // TODO: Not implemented
1139ac421f39SYohann       case CEED_EVAL_CURL:
1140ac421f39SYohann         break; // TODO: Not implemented
1141ac421f39SYohann       }
1142ac421f39SYohann     }
1143ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1144920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1145ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1146ac421f39SYohann       CeedChk(ierr);
1147ac421f39SYohann       // Basis action
1148ac421f39SYohann       switch (emode) {
1149ac421f39SYohann       case CEED_EVAL_NONE:
1150ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1151ac421f39SYohann         break; // No action
1152ac421f39SYohann       case CEED_EVAL_INTERP:
1153ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1154ac421f39SYohann         break;
1155ac421f39SYohann       case CEED_EVAL_GRAD:
1156ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1157ac421f39SYohann         break;
1158ac421f39SYohann       case CEED_EVAL_WEIGHT:
1159ac421f39SYohann         break; // Should not occur
1160ac421f39SYohann       case CEED_EVAL_DIV:
1161ac421f39SYohann         break; // TODO: Not implemented
1162ac421f39SYohann       case CEED_EVAL_CURL:
1163ac421f39SYohann         break; // TODO: Not implemented
1164ac421f39SYohann       }
1165ac421f39SYohann     }
1166ac421f39SYohann   } else {
1167920dcdc4Sjeremylt       code << "\n  // Note: No Collocated Gradient\n";
1168ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1169920dcdc4Sjeremylt       code << "  // -- Input field "<<i<<" --\n";
1170ac421f39SYohann       code << "  CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1171ac421f39SYohann     }
1172ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1173920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1174ac421f39SYohann       code << "  CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1175ac421f39SYohann     }
1176ac421f39SYohann   }
1177920dcdc4Sjeremylt   code << "  // QFunction Inputs and outputs\n";
11784d537eeaSYohann   code << "  CeedScalar* in["<<numinputfields<<"];\n";
11794d537eeaSYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1180920dcdc4Sjeremylt     code << "  // -- Input field "<<i<<" --\n";
1181ac421f39SYohann     code << "  in["<<i<<"] = r_q"<<i<<";\n";
11824d537eeaSYohann   }
11834d537eeaSYohann   code << "  CeedScalar* out["<<numoutputfields<<"];\n";
11844d537eeaSYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1185920dcdc4Sjeremylt     code << "  // -- Output field "<<i<<" --\n";
1186ac421f39SYohann     code << "  out["<<i<<"] = r_qq"<<i<<";\n";
11874d537eeaSYohann   }
1188920dcdc4Sjeremylt   code << "\n  // Apply QFunction\n";
1189241a4b83SYohann   string qFunctionName(qf_data->qFunctionName);
1190ac421f39SYohann   code << "  "<<qFunctionName<<"(ctx, ";
1191ac421f39SYohann   if (dim != 3 || basis_data->d_collograd1d) {
1192ac421f39SYohann     code << "1 ";
1193ac421f39SYohann   }else{
1194ac421f39SYohann     code << "Q1d ";
1195ac421f39SYohann   }
1196ac421f39SYohann   code << ", in, out);\n";
1197ac421f39SYohann   if (basis_data->d_collograd1d) {
1198920dcdc4Sjeremylt     code << "\n  // Note: Collocated Gradient\n";
1199ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1200920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1201ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1202ac421f39SYohann       CeedChk(ierr);
1203ac421f39SYohann       // Basis action
1204920dcdc4Sjeremylt       code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1205ac421f39SYohann       switch (emode) {
1206ac421f39SYohann       case CEED_EVAL_NONE:
1207ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1208ac421f39SYohann         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1209ac421f39SYohann         code << "  }\n";
1210ac421f39SYohann         break; // No action
1211ac421f39SYohann       case CEED_EVAL_INTERP:
1212ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1213ac421f39SYohann         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1214ac421f39SYohann         code << "  }\n";
1215ac421f39SYohann         break;
1216ac421f39SYohann       case CEED_EVAL_GRAD:
1217ac421f39SYohann         code << "  gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1218ac421f39SYohann         break;
1219ac421f39SYohann       case CEED_EVAL_WEIGHT:
1220ac421f39SYohann         break; // Should not occur
1221ac421f39SYohann       case CEED_EVAL_DIV:
1222ac421f39SYohann         break; // TODO: Not implemented
1223ac421f39SYohann       case CEED_EVAL_CURL:
1224ac421f39SYohann         break; // TODO: Not implemented
1225ac421f39SYohann       }
1226ac421f39SYohann     }
1227ac421f39SYohann     code << "}\n";
1228ac421f39SYohann   }
1229241a4b83SYohann 
1230241a4b83SYohann   // Output basis apply if needed
1231ac421f39SYohann   // Generate the correct eval mode code for each output
1232920dcdc4Sjeremylt   code << "\n// Output field basis action and restrictions\n";
1233241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1234920dcdc4Sjeremylt     code << "  // -- Output field "<<i<<" --\n";
1235241a4b83SYohann     // Get elemsize, emode, ncomp
1236241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1237241a4b83SYohann     CeedChk(ierr);
1238241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1239241a4b83SYohann     CeedChk(ierr);
1240241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1241241a4b83SYohann     CeedChk(ierr);
12424d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1243241a4b83SYohann     CeedChk(ierr);
1244241a4b83SYohann     // Basis action
1245920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1246241a4b83SYohann     switch (emode) {
1247241a4b83SYohann     case CEED_EVAL_NONE:
1248920dcdc4Sjeremylt       code << "  CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1249241a4b83SYohann       break; // No action
1250241a4b83SYohann     case CEED_EVAL_INTERP:
1251241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1252241a4b83SYohann       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1253241a4b83SYohann       break;
1254241a4b83SYohann     case CEED_EVAL_GRAD:
1255241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1256ac421f39SYohann       if (basis_data->d_collograd1d) {
1257ac421f39SYohann         code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1258ac421f39SYohann       } else {
1259241a4b83SYohann         code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1260ac421f39SYohann       }
1261241a4b83SYohann       break;
1262241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1263241a4b83SYohann       Ceed ceed;
1264241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1265241a4b83SYohann       return CeedError(ceed, 1,
1266241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1267241a4b83SYohann       break; // Should not occur
1268241a4b83SYohann     }
1269241a4b83SYohann     case CEED_EVAL_DIV:
1270241a4b83SYohann       break; // TODO: Not implemented
1271241a4b83SYohann     case CEED_EVAL_CURL:
1272241a4b83SYohann       break; // TODO: Not implemented
1273241a4b83SYohann     }
1274920dcdc4Sjeremylt     // Restriction
1275920dcdc4Sjeremylt     ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1276920dcdc4Sjeremylt     data->indices.out[i] = restr_data->d_ind;
1277920dcdc4Sjeremylt     if (data->indices.out[i]) {
1278920dcdc4Sjeremylt       ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1279920dcdc4Sjeremylt       code << "  // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n";
1280920dcdc4Sjeremylt       code << "  writeDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1281920dcdc4Sjeremylt     } else {
1282920dcdc4Sjeremylt       bool backendstrides;
1283920dcdc4Sjeremylt       ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict,
1284920dcdc4Sjeremylt                                                         &backendstrides);
1285920dcdc4Sjeremylt       CeedChk(ierr);
1286920dcdc4Sjeremylt       CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1287920dcdc4Sjeremylt       if (!backendstrides) {
1288920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1289920dcdc4Sjeremylt         CeedChk(ierr);
1290920dcdc4Sjeremylt       }
1291920dcdc4Sjeremylt       code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1292d80fc06aSjeremylt       code << "  writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1293920dcdc4Sjeremylt     }
1294241a4b83SYohann   }
1295241a4b83SYohann 
1296241a4b83SYohann   code << "  }\n";
1297241a4b83SYohann   code << "}\n\n";
1298241a4b83SYohann 
1299241a4b83SYohann //  std::cout << code.str();
1300241a4b83SYohann 
1301920dcdc4Sjeremylt   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0);
1302920dcdc4Sjeremylt   CeedChk(ierr);
1303241a4b83SYohann   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1304241a4b83SYohann   CeedChk(ierr);
1305241a4b83SYohann 
1306241a4b83SYohann   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1307241a4b83SYohann 
1308241a4b83SYohann   return 0;
1309241a4b83SYohann }
1310