1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#define CEED_DEBUG_COLOR 12

#include "ceed-cuda-gen.h"
#include <iostream>
#include <sstream>
#include "../cuda-shared/ceed-cuda-shared.h"

// JIT source string: CAS-loop software fallback for atomicAdd on doubles.
// NOTE(review): native atomicAdd(double *, double) requires SM60+; this
// fallback is presumably only appended to the JIT source when targeting
// older architectures — confirm against the code that assembles the kernel
// source (not visible in this chunk).
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
//------------------------------------------------------------------------------
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);

// JIT source string holding the device-side helpers shared by the generated
// operator kernels: field structs, per-thread backend state, matrix loading,
// and the 1D/2D/3D restriction and tensor-contraction routines.
static const char *deviceFunctions = QUOTE(

//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
// Input/output field pointer bundles passed to the generated kernel
// (up to 16 input and 16 output fields each).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread state threaded through every device helper below.
// NOTE(review): tidx/tidy/tidz/tid are presumably the thread indices within
// the block and `slice` a pointer into shared memory sized by the generated
// kernel — the code that fills this struct is not visible in this chunk.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// All threads of the block cooperatively copy the P*Q entries of the basis
// matrix d_B (global memory) into B, striding by the total block size.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Each thread with tidx < P1d gathers the NCOMP components of its node of
// element `elem` from d_u into registers r_u, via the restriction indices.
// Note: `nnodes` is not referenced in this body.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Strided (no-index) gather: node/component/element offsets are computed
// from the compile-time STRIDES_* parameters instead of an index array.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-accumulate of register values r_v into d_v. atomicAdd is required
// because distinct elements may map to the same L-vector entry.
// Note: `nnodes` is not referenced in this body.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided scatter-accumulate; plain += (no atomics) since the strided layout
// gives each (node, comp, elem) its own destination for this thread.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// V = B * U along x. The block stages each thread's value in shared memory
// (data.slice), with barriers before and after the read to avoid races.
// All threads write the slice and the barriers; only tidx < Q1d accumulate.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
// V = B^T * U along x (B indexed transposed: B[tidx + i*P1d]).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
// Apply the interpolation matrix c_B componentwise: r_V[comp] = B * r_U[comp].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: r_V[comp] = B^T * r_U[comp].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// Apply the gradient matrix c_G componentwise. Note: c_B is accepted for a
// signature parallel to the 2D/3D variants but is not used in 1D.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: r_V[comp] = G^T * r_U[comp]; c_B unused in 1D.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// 2D gather: thread (tidx, tidy) owns node tidx + tidy*P1d of element `elem`.
// Note: `nnodes` is not referenced in this body.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// 2D strided gather using compile-time STRIDES_* offsets.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// 2D scatter-accumulate; atomicAdd because elements may share L-vector nodes.
// Note: `nnodes` is not referenced in this body.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// 2D strided scatter-accumulate; plain += since destinations are unshared.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// V = B * U along x for a 2D tile staged in shared memory; rows are spaced
// by T1d (presumably the generator-defined tile dimension — not visible in
// this chunk). All threads hit both barriers; only the guarded accumulate.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// V = B * U along y: the y-index of the output selects a row of B, the
// shared-memory slice is walked down the y (T1d-strided) direction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// V = B^T * U along y (B indexed transposed: B[tidy + i*P1d]).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// V = B^T * U along x; output zeroed before accumulation.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Accumulating variant of ContractXTranspose2d: V += B^T * U along x.
// V is deliberately NOT zeroed here — callers (e.g. gradTranspose2d) use this
// to sum contributions from multiple derivative directions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Componentwise 2D interpolation: contract x then y with c_B, staging the
// intermediate in the single-element register buffer r_t.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract y then x with B^T.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// d/dx goes to r_V[comp + 0*NCOMP] (G in x, B in y);
// d/dy goes to r_V[comp + 1*NCOMP] (B in x, G in y).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: the x-derivative slot seeds r_V (zeroing variant),
// the y-derivative slot is summed in via the Add variant.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// 3D gather: thread (tidx, tidy) loops over the z-layers of its pencil,
// storing layer z of component comp at r_u[z + comp*P1d].
// Note: `nnodes` is not referenced in this body.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// 3D strided gather using compile-time STRIDES_* offsets.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one z-slice (layer q) of quadrature data for each component.
// Note: `nquads` is not referenced in this body.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Strided gather of one z-slice (layer q) of quadrature data per component.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// 3D scatter-accumulate over z-layers; atomicAdd because elements may share
// L-vector nodes. Note: `nnodes` is not referenced in this body.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// 3D strided scatter-accumulate; plain += since destinations are unshared.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
    }
}

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// V[k] = B * U[k] along x for each z-layer k; each thread pre-loads its row
// of B into registers, then layers are streamed through the shared slice.
// NOTE(review): r_B is filled from B[i + data.tidx*P1d] without a tidx < Q1d
// guard — if the tile dim exceeds Q1d this reads past the Q1d*P1d entries of
// B; confirm T1d/blockDim sizing against the generated launch code.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// V[k] = B * U[k] along y for each z-layer k; B row selected by tidy.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// V[k] = B * U along z. Purely register-local (U holds the thread's whole
// z-pencil), so no shared memory or barriers are needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// V[k] = B^T * U along z; register-local like ContractZ3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// V[k] = B^T * U[k] along y for each layer k.
// NOTE(review): r_B is filled from B[data.tidy + i*P1d] without a tidy < P1d
// guard — potentially out of range when the tile dim exceeds P1d; confirm
// T1d/blockDim sizing against the generated launch code.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
// Accumulating variant of ContractTransposeY3d: V[k] += B^T * U[k] (V is not
// zeroed, so callers can sum multiple directional contributions).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// V[k] = B^T * U[k] along x for each layer k.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Accumulating variant of ContractTransposeX3d: V[k] += B^T * U[k].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Componentwise 3D interpolation: contract x, y, then z with c_B, staging
// intermediates in the register pencils r_t1/r_t2 (sized by T1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract z, y, then x with B^T.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// d/dx -> r_V[.. + 0*NCOMP*Q1d] (G, B, B), d/dy -> [.. + 1*NCOMP*Q1d]
// (B, G, B), d/dz -> [.. + 2*NCOMP*Q1d] (B, B, G).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: the x-derivative chain seeds r_V via the zeroing
// ContractTransposeX3d; the y- and z-derivative chains are summed in via
// ContractTransposeAddX3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D collocated
derivatives computation 665 //------------------------------------------------------------------------------ 666 template <int NCOMP, int Q1d> 667 inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 668 if (data.tidx < Q1d && data.tidy < Q1d) { 669 for (CeedInt comp = 0; comp < NCOMP; ++comp) { 670 data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 671 __syncthreads(); 672 // X derivative 673 r_V[comp+0*NCOMP] = 0.0; 674 for (CeedInt i = 0; i < Q1d; ++i) 675 r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 676 // Y derivative 677 r_V[comp+1*NCOMP] = 0.0; 678 for (CeedInt i = 0; i < Q1d; ++i) 679 r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 680 // Z derivative 681 r_V[comp+2*NCOMP] = 0.0; 682 for (CeedInt i = 0; i < Q1d; ++i) 683 r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 684 __syncthreads(); 685 } 686 } 687 } 688 689 //------------------------------------------------------------------------------ 690 // 3D collocated derivatives transpose 691 //------------------------------------------------------------------------------ 692 template <int NCOMP, int Q1d> 693 inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 694 if (data.tidx < Q1d && data.tidy < Q1d) { 695 for (CeedInt comp = 0; comp < NCOMP; ++comp) { 696 // X derivative 697 data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 698 __syncthreads(); 699 for (CeedInt i = 0; i < Q1d; ++i) 700 r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 701 __syncthreads(); 702 // Y derivative 703 data.slice[data.tidx + data.tidy*T1d] = 
r_U[comp + 1*NCOMP]; 704 __syncthreads(); 705 for (CeedInt i = 0; i < Q1d; ++i) 706 r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 707 __syncthreads(); 708 // Z derivative 709 for (CeedInt i = 0; i < Q1d; ++i) 710 r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 711 } 712 } 713 } 714 715 //------------------------------------------------------------------------------ 716 // 1D quadrature weights 717 //------------------------------------------------------------------------------ 718 template <int Q1d> 719 inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 720 *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 721 } 722 723 //------------------------------------------------------------------------------ 724 // 2D quadrature weights 725 //------------------------------------------------------------------------------ 726 template <int Q1d> 727 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 728 *w = (data.tidx < Q1d && data.tidy < Q1d) ? 729 qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 730 } 731 732 //------------------------------------------------------------------------------ 733 // 3D quadrature weights 734 //------------------------------------------------------------------------------ 735 template <int Q1d> 736 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 737 const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 738 const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 739 for (CeedInt z = 0; z < Q1d; ++z) 740 w[z] = quad ? 
pw*qweight1d[z] : 0.0; 741 } 742 743 ); 744 //------------------------------------------------------------------------------ 745 // Build singe operator kernel 746 //------------------------------------------------------------------------------ 747 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 748 749 using std::ostringstream; 750 using std::string; 751 int ierr; 752 bool setupdone; 753 ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr); 754 if (setupdone) return 0; 755 Ceed ceed; 756 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 757 CeedOperator_Cuda_gen *data; 758 ierr = CeedOperatorGetData(op, &data); CeedChk(ierr); 759 CeedQFunction qf; 760 CeedQFunction_Cuda_gen *qf_data; 761 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 762 ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr); 763 CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 764 numoutputfields, ncomp, dim = 0, lsize; 765 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 766 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 767 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 768 CeedChk(ierr); 769 CeedOperatorField *opinputfields, *opoutputfields; 770 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 771 CeedChk(ierr); 772 CeedQFunctionField *qfinputfields, *qfoutputfields; 773 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 774 CeedChk(ierr); 775 CeedEvalMode emode; 776 CeedBasis basis; 777 CeedBasis_Cuda_shared *basis_data; 778 CeedElemRestriction Erestrict; 779 CeedElemRestriction_Cuda *restr_data; 780 781 ostringstream code; 782 string devFunctions(deviceFunctions); 783 784 // Add atomicAdd function for old NVidia architectures 785 struct cudaDeviceProp prop; 786 Ceed_Cuda *ceed_data; 787 ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr); 788 ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 789 if (prop.major<6){ 790 code << atomicAdd; 
791 } 792 793 code << devFunctions; 794 795 string qFunction(qf_data->qFunctionSource); 796 string qFunctionName(qf_data->qFunctionName); 797 string oper; 798 oper = "CeedKernel_Cuda_gen_" + qFunctionName; 799 800 code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 801 code << "\n#define CeedPragmaSIMD\n"; 802 803 // Find dim and Q1d 804 bool useCollograd = true; 805 data->maxP1d = 0; 806 for (CeedInt i = 0; i < numinputfields; i++) { 807 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 808 if (basis != CEED_BASIS_COLLOCATED) { 809 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 810 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 811 CeedChk(ierr); 812 813 // Check for collocated gradient 814 useCollograd = useCollograd && basis_data->d_collograd1d; 815 816 // Collect dim and Q1d 817 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 818 bool isTensor; 819 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 820 if (isTensor) { 821 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 822 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 823 if (P1d>data->maxP1d) data->maxP1d = P1d; 824 } else { 825 // LCOV_EXCL_START 826 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 827 // LCOV_EXCL_STOP 828 } 829 } 830 } 831 // Check output bases for Q1d, dim as well 832 // The only imput basis might be CEED_BASIS_COLLOCATED 833 for (CeedInt i = 0; i < numoutputfields; i++) { 834 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 835 836 if (basis != CEED_BASIS_COLLOCATED) { 837 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 838 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 839 CeedChk(ierr); 840 841 // Collect dim and Q1d 842 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 843 bool isTensor; 844 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 845 if 
(isTensor) { 846 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 847 } else { 848 // LCOV_EXCL_START 849 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 850 // LCOV_EXCL_STOP 851 } 852 853 // Check for collocated gradient 854 useCollograd = useCollograd && basis_data->d_collograd1d; 855 } 856 } 857 data->dim = dim; 858 data->Q1d = Q1d; 859 860 // Define CEED_Q_VLA 861 if (dim != 3 || useCollograd) { 862 code << "\n#define CEED_Q_VLA 1\n\n"; 863 } else { 864 code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 865 } 866 867 code << qFunction; 868 869 // Setup 870 code << "\n// -----------------------------------------------------------------------------\n"; 871 code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 872 for (CeedInt i = 0; i < numinputfields; i++) { 873 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 874 CeedChk(ierr); 875 if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 876 code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 877 } 878 } 879 880 for (CeedInt i = 0; i < numoutputfields; i++) { 881 code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 882 } 883 884 code << " const CeedInt Dim = "<<dim<<";\n"; 885 code << " const CeedInt Q1d = "<<Q1d<<";\n"; 886 887 code << " extern __shared__ CeedScalar slice[];\n"; 888 code << " BackendData data;\n"; 889 code << " data.tidx = threadIdx.x;\n"; 890 code << " data.tidy = threadIdx.y;\n"; 891 code << " data.tidz = threadIdx.z;\n"; 892 code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 893 code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 894 895 code << "\n // -- Input field constants and basis data --\n"; 896 //Initialize constants, and matrices B and G 897 for (CeedInt i = 0; i < numinputfields; i++) { 898 code << " // ---- 
Input field "<<i<<" ----\n"; 899 // Get elemsize, emode, ncomp 900 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 901 CeedChk(ierr); 902 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 903 CeedChk(ierr); 904 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 905 CeedChk(ierr); 906 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 907 CeedChk(ierr); 908 909 // Set field constants 910 if (emode != CEED_EVAL_WEIGHT) { 911 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 912 if (basis != CEED_BASIS_COLLOCATED) { 913 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 914 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 915 } else { 916 code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 917 } 918 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 919 } 920 921 // Load basis data 922 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 923 switch (emode) { 924 case CEED_EVAL_NONE: 925 break; 926 case CEED_EVAL_INTERP: 927 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 928 data->B.in[i] = basis_data->d_interp1d; 929 code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 930 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 931 break; 932 case CEED_EVAL_GRAD: 933 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 934 data->B.in[i] = basis_data->d_interp1d; 935 code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 936 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 937 if (useCollograd) { 938 data->G.in[i] = basis_data->d_collograd1d; 939 code << " __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 940 code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 941 } else { 942 data->G.in[i] = basis_data->d_grad1d; 943 code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 944 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, 
G.in["<<i<<"], s_G_in_"<<i<<");\n"; 945 } 946 break; 947 case CEED_EVAL_WEIGHT: 948 break; // No action 949 case CEED_EVAL_DIV: 950 break; // TODO: Not implemented 951 case CEED_EVAL_CURL: 952 break; // TODO: Not implemented 953 } 954 } 955 956 code << "\n // -- Output field constants and basis data --\n"; 957 for (CeedInt i = 0; i < numoutputfields; i++) { 958 code << " // ---- Output field "<<i<<" ----\n"; 959 // Get elemsize, emode, ncomp 960 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 961 CeedChk(ierr); 962 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 963 CeedChk(ierr); 964 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 965 CeedChk(ierr); 966 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 967 CeedChk(ierr); 968 969 // Set field constants 970 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 971 if (basis != CEED_BASIS_COLLOCATED) { 972 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 973 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 974 } else { 975 code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 976 } 977 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 978 979 // Load basis data 980 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 981 switch (emode) { 982 case CEED_EVAL_NONE: 983 break; // No action 984 case CEED_EVAL_INTERP: 985 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 986 data->B.out[i] = basis_data->d_interp1d; 987 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 988 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 989 break; 990 case CEED_EVAL_GRAD: 991 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 992 data->B.out[i] = basis_data->d_interp1d; 993 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 994 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 995 if (useCollograd) { 996 
data->G.out[i] = basis_data->d_collograd1d; 997 code << " __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 998 code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 999 } else { 1000 data->G.out[i] = basis_data->d_grad1d; 1001 code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1002 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1003 } 1004 break; 1005 // LCOV_EXCL_START 1006 case CEED_EVAL_WEIGHT: { 1007 Ceed ceed; 1008 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1009 return CeedError(ceed, 1, 1010 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1011 break; // Should not occur 1012 } 1013 case CEED_EVAL_DIV: 1014 break; // TODO: Not implemented 1015 case CEED_EVAL_CURL: 1016 break; // TODO: Not implemented 1017 // LCOV_EXCL_STOP 1018 } 1019 } 1020 code << "\n // -- Element loop --\n"; 1021 code << " __syncthreads();\n"; 1022 code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1023 // Input basis apply if needed 1024 // Generate the correct eval mode code for each input 1025 code << " // -- Input field restrictions and basis actions --\n"; 1026 for (CeedInt i = 0; i < numinputfields; i++) { 1027 code << " // ---- Input field "<<i<<" ----\n"; 1028 // Get elemsize, emode, ncomp 1029 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1030 CeedChk(ierr); 1031 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1032 CeedChk(ierr); 1033 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1034 CeedChk(ierr); 1035 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1036 CeedChk(ierr); 1037 1038 // Restriction 1039 if (emode != CEED_EVAL_WEIGHT && 1040 !((emode == CEED_EVAL_NONE) && useCollograd)) { 1041 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1042 1043 bool isStrided; 1044 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); 
CeedChk(ierr); 1045 if (!isStrided) { 1046 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1047 CeedChk(ierr); 1048 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1049 CeedInt compstride; 1050 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1051 code << " // CompStride: "<<compstride<<"\n"; 1052 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1053 data->indices.in[i] = restr_data->d_ind; 1054 code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1055 } else { 1056 bool backendstrides; 1057 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1058 CeedChk(ierr); 1059 CeedInt nelem; 1060 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1061 CeedChk(ierr); 1062 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1063 if (!backendstrides) { 1064 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1065 CeedChk(ierr); 1066 } 1067 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1068 code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1069 } 1070 } 1071 1072 // Basis action 1073 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1074 switch (emode) { 1075 case CEED_EVAL_NONE: 1076 if (!useCollograd) { 1077 code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1078 } 1079 break; 1080 case CEED_EVAL_INTERP: 1081 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1082 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1083 break; 1084 case CEED_EVAL_GRAD: 1085 if (useCollograd) { 1086 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1087 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", 
s_B_in_"<<i<<", r_t"<<i<<");\n"; 1088 } else { 1089 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1090 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1091 } 1092 break; 1093 case CEED_EVAL_WEIGHT: 1094 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1095 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1096 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 1097 data->W = basis_data->d_qweight1d; 1098 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1099 break; // No action 1100 case CEED_EVAL_DIV: 1101 break; // TODO: Not implemented 1102 case CEED_EVAL_CURL: 1103 break; // TODO: Not implemented 1104 } 1105 } 1106 1107 // Q function 1108 code << "\n // -- Output field setup --\n"; 1109 for (CeedInt i = 0; i < numoutputfields; i++) { 1110 code << "\n // ---- Output field "<<i<<" ----\n"; 1111 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1112 CeedChk(ierr); 1113 if (emode==CEED_EVAL_GRAD) 1114 { 1115 if (useCollograd) { 1116 //Accumulator for gradient slices 1117 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1118 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1119 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1120 code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1121 code << " }\n"; 1122 code << " }\n"; 1123 } else { 1124 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1125 } 1126 } 1127 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1128 { 1129 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1130 } 1131 } 1132 // We treat quadrature points per slice in 3d to save registers 1133 if (useCollograd) { 1134 code << "\n // Note: Collocated Gradient\n"; 1135 code << "#pragma unroll\n"; 1136 code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1137 code << " // -- Input fields --\n"; 1138 for (CeedInt i = 0; i < numinputfields; i++) { 1139 code << " // ---- 
Input field "<<i<<" ----\n"; 1140 // Get elemsize, emode, ncomp 1141 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1142 CeedChk(ierr); 1143 // Basis action 1144 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1145 switch (emode) { 1146 case CEED_EVAL_NONE: 1147 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1148 1149 bool isStrided; 1150 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr); 1151 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr); 1152 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1153 if (!isStrided) { 1154 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1155 CeedChk(ierr); 1156 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1157 CeedInt compstride; 1158 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1159 code << " // CompStride: "<<compstride<<"\n"; 1160 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1161 data->indices.in[i] = restr_data->d_ind; 1162 code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1163 } else { 1164 bool backendstrides; 1165 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1166 CeedChk(ierr); 1167 CeedInt nelem; 1168 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1169 CeedChk(ierr); 1170 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1171 if (!backendstrides) { 1172 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1173 CeedChk(ierr); 1174 } 1175 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1176 code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1177 } 1178 break; 1179 case CEED_EVAL_INTERP: 1180 code << " CeedScalar 
r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1181 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1182 code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1183 code << " }\n"; 1184 break; 1185 case CEED_EVAL_GRAD: 1186 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1187 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1188 break; 1189 case CEED_EVAL_WEIGHT: 1190 code << " CeedScalar r_q"<<i<<"[1];\n"; 1191 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1192 break; // No action 1193 case CEED_EVAL_DIV: 1194 break; // TODO: Not implemented 1195 case CEED_EVAL_CURL: 1196 break; // TODO: Not implemented 1197 } 1198 } 1199 code << "\n // -- Output fields --\n"; 1200 for (CeedInt i = 0; i < numoutputfields; i++) { 1201 code << " // ---- Output field "<<i<<" ----\n"; 1202 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1203 CeedChk(ierr); 1204 // Basis action 1205 switch (emode) { 1206 case CEED_EVAL_NONE: 1207 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1208 break; // No action 1209 case CEED_EVAL_INTERP: 1210 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1211 break; 1212 case CEED_EVAL_GRAD: 1213 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1214 break; 1215 case CEED_EVAL_WEIGHT: 1216 break; // Should not occur 1217 case CEED_EVAL_DIV: 1218 break; // TODO: Not implemented 1219 case CEED_EVAL_CURL: 1220 break; // TODO: Not implemented 1221 } 1222 } 1223 } else { 1224 code << "\n // Note: No Collocated Gradient\n"; 1225 code << " // -- Input fields --\n"; 1226 for (CeedInt i = 0; i < numinputfields; i++) { 1227 code << " // ---- Input field "<<i<<" ----\n"; 1228 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1229 } 1230 code << " // -- Output fields --\n"; 1231 for (CeedInt i = 0; i < numoutputfields; i++) { 1232 code << " // ---- Output field "<<i<<" ----\n"; 1233 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1234 } 1235 } 1236 code << "\n 
// -- QFunction Inputs and outputs --\n"; 1237 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1238 for (CeedInt i = 0; i < numinputfields; i++) { 1239 code << " // ---- Input field "<<i<<" ----\n"; 1240 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1241 } 1242 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1243 for (CeedInt i = 0; i < numoutputfields; i++) { 1244 code << " // ---- Output field "<<i<<" ----\n"; 1245 code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1246 } 1247 code << "\n // -- Apply QFunction --\n"; 1248 code << " "<<qFunctionName<<"(ctx, "; 1249 if (dim != 3 || useCollograd) { 1250 code << "1"; 1251 } else { 1252 code << "Q1d"; 1253 } 1254 code << ", in, out);\n"; 1255 if (useCollograd) { 1256 code << "\n // Note: Collocated Gradient\n"; 1257 code << " // -- Output fields --\n"; 1258 for (CeedInt i = 0; i < numoutputfields; i++) { 1259 code << " // ---- Output field "<<i<<" ----\n"; 1260 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1261 CeedChk(ierr); 1262 // Basis action 1263 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1264 switch (emode) { 1265 case CEED_EVAL_NONE: 1266 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1267 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1268 code << " }\n"; 1269 break; // No action 1270 case CEED_EVAL_INTERP: 1271 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1272 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1273 code << " }\n"; 1274 break; 1275 case CEED_EVAL_GRAD: 1276 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1277 break; 1278 case CEED_EVAL_WEIGHT: 1279 break; // Should not occur 1280 case CEED_EVAL_DIV: 1281 break; // TODO: Not implemented 1282 case CEED_EVAL_CURL: 1283 break; // TODO: Not implemented 1284 } 1285 } 1286 code << " }\n"; 1287 } 1288 1289 // Output basis apply if needed 1290 // Generate the correct eval mode code for each output 1291 code << "\n // 
-- Output field basis action and restrictions --\n"; 1292 for (CeedInt i = 0; i < numoutputfields; i++) { 1293 code << " // ---- Output field "<<i<<" ----\n"; 1294 // Get elemsize, emode, ncomp 1295 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1296 CeedChk(ierr); 1297 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1298 CeedChk(ierr); 1299 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1300 CeedChk(ierr); 1301 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1302 CeedChk(ierr); 1303 // Basis action 1304 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1305 switch (emode) { 1306 case CEED_EVAL_NONE: 1307 code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1308 break; // No action 1309 case CEED_EVAL_INTERP: 1310 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1311 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1312 break; 1313 case CEED_EVAL_GRAD: 1314 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1315 if (useCollograd) { 1316 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1317 } else { 1318 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1319 } 1320 break; 1321 // LCOV_EXCL_START 1322 case CEED_EVAL_WEIGHT: { 1323 Ceed ceed; 1324 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1325 return CeedError(ceed, 1, 1326 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1327 break; // Should not occur 1328 } 1329 case CEED_EVAL_DIV: 1330 break; // TODO: Not implemented 1331 case CEED_EVAL_CURL: 1332 break; // TODO: Not implemented 1333 // LCOV_EXCL_STOP 1334 } 1335 // Restriction 1336 bool isStrided; 1337 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1338 if 
(!isStrided) { 1339 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1340 CeedChk(ierr); 1341 code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 1342 CeedInt compstride; 1343 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1344 code << " // CompStride: "<<compstride<<"\n"; 1345 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1346 data->indices.out[i] = restr_data->d_ind; 1347 code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1348 } else { 1349 bool backendstrides; 1350 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1351 CeedChk(ierr); 1352 CeedInt nelem; 1353 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1354 CeedChk(ierr); 1355 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1356 if (!backendstrides) { 1357 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1358 CeedChk(ierr); 1359 } 1360 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1361 code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1362 } 1363 } 1364 1365 code << " }\n"; 1366 code << "}\n"; 1367 code << "// -----------------------------------------------------------------------------\n\n"; 1368 1369 // View kernel for debugging 1370 CeedDebug(code.str().c_str()); 1371 1372 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 1373 "T1d", CeedIntMax(Q1d, data->maxP1d)); 1374 CeedChk(ierr); 1375 ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1376 CeedChk(ierr); 1377 1378 ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1379 return 0; 1380 } 1381 //------------------------------------------------------------------------------ 1382