1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #define CEED_DEBUG_COLOR 12 18 19 #include <ceed/ceed.h> 20 #include <ceed/backend.h> 21 #include <cuda_runtime.h> 22 #include <iostream> 23 #include <sstream> 24 #include "ceed-cuda-gen.h" 25 #include "../cuda-shared/ceed-cuda-shared.h" 26 27 static const char *atomicAdd = QUOTE( 28 //------------------------------------------------------------------------------ 29 // Atomic add, for older CUDA 30 //------------------------------------------------------------------------------ 31 __device__ double atomicAdd(double *address, double val) { 32 unsigned long long int *address_as_ull = (unsigned long long int *)address; 33 unsigned long long int old = *address_as_ull, assumed; 34 do { 35 assumed = old; 36 old = 37 atomicCAS(address_as_ull, assumed, 38 __double_as_longlong(val + 39 __longlong_as_double(assumed))); 40 // Note: uses integer comparison to avoid hang in case of NaN 41 // (since NaN != NaN) 42 } while (assumed != old); 43 return __longlong_as_double(old); 44 } 45 ); 46 47 static const char *deviceFunctions = QUOTE( 48 49 //------------------------------------------------------------------------------ 50 // Typedefs 51 //------------------------------------------------------------------------------ 52 typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 53 typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 54 55 typedef struct { 56 CeedInt tidx; 57 CeedInt tidy; 58 CeedInt tidz; 59 CeedInt tid; 60 CeedScalar* slice; 61 } BackendData; 62 63 //------------------------------------------------------------------------------ 64 // Load matrices for basis actions 65 //------------------------------------------------------------------------------ 66 template <int P, int Q> 67 inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 68 for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 69 B[i] = d_B[i]; 70 } 71 72 //------------------------------------------------------------------------------ 73 // 1D 74 //------------------------------------------------------------------------------ 75 76 //------------------------------------------------------------------------------ 77 // L-vector -> E-vector, offsets provided 78 //------------------------------------------------------------------------------ 79 template <int NCOMP, int COMPSTRIDE, int P1d> 80 inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 81 if (data.tidx < P1d) { 82 const CeedInt node = data.tidx; 83 const CeedInt ind = indices[node + elem * P1d]; 84 for (CeedInt comp = 0; comp < NCOMP; ++comp) 85 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 86 } 87 } 88 89 //------------------------------------------------------------------------------ 90 // L-vector -> E-vector, strided 91 //------------------------------------------------------------------------------ 92 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 93 inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 94 if (data.tidx < P1d) { 95 const CeedInt node = data.tidx; 96 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 97 for (CeedInt comp = 0; comp < NCOMP; ++comp) 98 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 99 } 100 } 101 102 //------------------------------------------------------------------------------ 103 // E-vector -> L-vector, offsets provided 104 //------------------------------------------------------------------------------ 105 template <int NCOMP, int COMPSTRIDE, int P1d> 106 inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 107 if (data.tidx < P1d) { 108 const CeedInt node = data.tidx; 109 const CeedInt ind = indices[node + elem * P1d]; 110 for (CeedInt comp = 0; comp < NCOMP; ++comp) 111 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 112 } 113 } 114 115 //------------------------------------------------------------------------------ 116 // E-vector -> L-vector, strided 117 //------------------------------------------------------------------------------ 118 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 119 inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 120 if (data.tidx < P1d) { 121 const CeedInt node = data.tidx; 122 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 123 for (CeedInt comp = 0; comp < NCOMP; ++comp) 124 d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 125 } 126 } 127 128 //------------------------------------------------------------------------------ 129 // 1D tensor contraction x 130 //------------------------------------------------------------------------------ 131 template <int NCOMP, int P1d, int Q1d> 132 inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 133 data.slice[data.tidx] = *U; 134 __syncthreads(); 135 *V = 0.0; 136 if (data.tidx < Q1d) 137 for (CeedInt i = 0; i < P1d; ++i) 138 *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 139 __syncthreads(); 140 } 141 142 //------------------------------------------------------------------------------ 143 // 1D transpose tensor contraction x 144 //------------------------------------------------------------------------------ 145 template <int NCOMP, int P1d, int Q1d> 146 inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 147 data.slice[data.tidx] = *U; 148 __syncthreads(); 149 *V = 0.0; 150 if (data.tidx < P1d) 151 for (CeedInt i = 0; i < Q1d; ++i) 152 *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 153 __syncthreads(); 154 } 155 156 //------------------------------------------------------------------------------ 157 // 1D interpolate to quadrature points 158 //------------------------------------------------------------------------------ 159 template <int NCOMP, int P1d, int Q1d> 160 inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 161 for (CeedInt comp = 0; comp < NCOMP; comp++) 162 ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 163 } 164 165 //------------------------------------------------------------------------------ 166 // 1D interpolate transpose 167 //------------------------------------------------------------------------------ 168 template <int NCOMP, int P1d, int Q1d> 169 inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 170 for (CeedInt comp=0; comp<NCOMP; comp++) 171 ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 172 } 173 174 //------------------------------------------------------------------------------ 175 // 1D derivatives at quadrature points 176 //------------------------------------------------------------------------------ 177 template <int NCOMP, int P1d, int Q1d> 178 inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 179 for (CeedInt comp = 0; comp < NCOMP; comp++) 180 ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 181 } 182 183 //------------------------------------------------------------------------------ 184 // 1D derivatives transpose 185 //------------------------------------------------------------------------------ 186 template <int NCOMP, int P1d, int Q1d> 187 inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 188 for (CeedInt comp = 0; comp < NCOMP; comp++) 189 ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 190 } 191 192 //------------------------------------------------------------------------------ 193 // 2D 194 //------------------------------------------------------------------------------ 195 196 //------------------------------------------------------------------------------ 197 // L-vector -> E-vector, offsets provided 198 //------------------------------------------------------------------------------ 199 template <int NCOMP, int COMPSTRIDE, int P1d> 200 inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 201 if (data.tidx < P1d && data.tidy < P1d) { 202 const CeedInt node = data.tidx + data.tidy*P1d; 203 const CeedInt ind = indices[node + elem * P1d*P1d]; 204 for (CeedInt comp = 0; comp < NCOMP; ++comp) 205 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 206 } 207 } 208 209 //------------------------------------------------------------------------------ 210 // L-vector -> E-vector, strided 211 //------------------------------------------------------------------------------ 212 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 213 inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 214 if (data.tidx < P1d && data.tidy < P1d) { 215 const CeedInt node = data.tidx + data.tidy*P1d; 216 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 217 for (CeedInt comp = 0; comp < NCOMP; ++comp) 218 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 219 } 220 } 221 222 //------------------------------------------------------------------------------ 223 // E-vector -> L-vector, offsets provided 224 //------------------------------------------------------------------------------ 225 template <int NCOMP, int COMPSTRIDE, int P1d> 226 inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 227 if (data.tidx < P1d && data.tidy < P1d) { 228 const CeedInt node = data.tidx + data.tidy*P1d; 229 const CeedInt ind = indices[node + elem * P1d*P1d]; 230 for (CeedInt comp = 0; comp < NCOMP; ++comp) 231 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 232 } 233 } 234 235 //------------------------------------------------------------------------------ 236 // E-vector -> L-vector, strided 237 //------------------------------------------------------------------------------ 238 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 239 inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 240 if (data.tidx < P1d && data.tidy < P1d) { 241 const CeedInt node = data.tidx + data.tidy*P1d; 242 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 243 for (CeedInt comp = 0; comp < NCOMP; ++comp) 244 d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 245 } 246 } 247 248 //------------------------------------------------------------------------------ 249 // 2D tensor contraction x 250 //------------------------------------------------------------------------------ 251 template <int NCOMP, int P1d, int Q1d> 252 inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 253 data.slice[data.tidx+data.tidy*T1d] = *U; 254 __syncthreads(); 255 *V = 0.0; 256 if (data.tidx < Q1d && data.tidy < P1d) 257 for (CeedInt i = 0; i < P1d; ++i) 258 *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 259 __syncthreads(); 260 } 261 262 //------------------------------------------------------------------------------ 263 // 2D tensor contract y 264 //------------------------------------------------------------------------------ 265 template <int NCOMP, int P1d, int Q1d> 266 inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 267 data.slice[data.tidx+data.tidy*T1d] = *U; 268 __syncthreads(); 269 *V = 0.0; 270 if (data.tidx < Q1d && data.tidy < Q1d) 271 for (CeedInt i = 0; i < P1d; ++i) 272 *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 273 __syncthreads(); 274 } 275 276 //------------------------------------------------------------------------------ 277 // 2D transpose tensor contract y 278 //------------------------------------------------------------------------------ 279 template <int NCOMP, int P1d, int Q1d> 280 inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 281 data.slice[data.tidx+data.tidy*T1d] = *U; 282 __syncthreads(); 283 *V = 0.0; 284 if (data.tidx < Q1d && data.tidy < P1d) 285 for (CeedInt i = 0; i < Q1d; ++i) 286 *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 287 __syncthreads(); 288 } 289 290 //------------------------------------------------------------------------------ 291 // 2D transpose tensor contract x 292 //------------------------------------------------------------------------------ 293 template <int NCOMP, int P1d, int Q1d> 294 inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 295 data.slice[data.tidx+data.tidy*T1d] = *U; 296 __syncthreads(); 297 *V = 0.0; 298 if (data.tidx < P1d && data.tidy < P1d) 299 for (CeedInt i = 0; i < Q1d; ++i) 300 *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 301 __syncthreads(); 302 } 303 304 //------------------------------------------------------------------------------ 305 // 2D transpose tensor contract and add x 306 //------------------------------------------------------------------------------ 307 template <int NCOMP, int P1d, int Q1d> 308 inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 309 data.slice[data.tidx+data.tidy*T1d] = *U; 310 __syncthreads(); 311 if (data.tidx < P1d && data.tidy < P1d) 312 for (CeedInt i = 0; i < Q1d; ++i) 313 *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 314 __syncthreads(); 315 } 316 317 //------------------------------------------------------------------------------ 318 // 2D interpolate to quadrature points 319 //------------------------------------------------------------------------------ 320 template <int NCOMP, int P1d, int Q1d> 321 inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 322 CeedScalar r_t[1]; 323 for (CeedInt comp = 0; comp < NCOMP; comp++) { 324 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 325 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 326 } 327 } 328 329 //------------------------------------------------------------------------------ 330 // 2D interpolate transpose 331 //------------------------------------------------------------------------------ 332 template <int NCOMP, int P1d, int Q1d> 333 inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 334 CeedScalar r_t[1]; 335 for (CeedInt comp = 0; comp < NCOMP; comp++) { 336 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 337 ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 338 } 339 } 340 341 //------------------------------------------------------------------------------ 342 // 2D derivatives at quadrature points 343 //------------------------------------------------------------------------------ 344 template <int NCOMP, int P1d, int Q1d> 345 inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 346 CeedScalar r_t[1]; 347 for (CeedInt comp = 0; comp < NCOMP; comp++) { 348 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 349 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 350 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 351 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 352 } 353 } 354 355 //------------------------------------------------------------------------------ 356 // 2D derivatives transpose 357 //------------------------------------------------------------------------------ 358 template <int NCOMP, int P1d, int Q1d> 359 inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 360 CeedScalar r_t[1]; 361 for (CeedInt comp = 0; comp < NCOMP; comp++) { 362 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 363 ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 364 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 365 ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 366 } 367 } 368 369 //------------------------------------------------------------------------------ 370 // 3D 371 //------------------------------------------------------------------------------ 372 373 //------------------------------------------------------------------------------ 374 // L-vector -> E-vector, offsets provided 375 //------------------------------------------------------------------------------ 376 template <int NCOMP, int COMPSTRIDE, int P1d> 377 inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 378 if (data.tidx < P1d && data.tidy < P1d) 379 for (CeedInt z = 0; z < P1d; ++z) { 380 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 381 const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 382 for (CeedInt comp = 0; comp < NCOMP; ++comp) 383 r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 384 } 385 } 386 387 //------------------------------------------------------------------------------ 388 // L-vector -> E-vector, strided 389 //------------------------------------------------------------------------------ 390 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 391 inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 392 if (data.tidx < P1d && data.tidy < P1d) 393 for (CeedInt z = 0; z < P1d; ++z) { 394 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 395 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 396 for (CeedInt comp = 0; comp < NCOMP; ++comp) 397 r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 398 } 399 } 400 401 //------------------------------------------------------------------------------ 402 // E-vector -> Q-vector, offests provided 403 //------------------------------------------------------------------------------ 404 template <int NCOMP, int COMPSTRIDE, int Q1d> 405 inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 406 if (data.tidx < Q1d && data.tidy < Q1d) { 407 const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 408 const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 409 for (CeedInt comp = 0; comp < NCOMP; ++comp) 410 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 411 } 412 } 413 414 //------------------------------------------------------------------------------ 415 // E-vector -> Q-vector, strided 416 //------------------------------------------------------------------------------ 417 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 418 inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 419 if (data.tidx < Q1d && data.tidy < Q1d) { 420 const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 421 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 422 for (CeedInt comp = 0; comp < NCOMP; ++comp) 423 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 424 } 425 } 426 427 //------------------------------------------------------------------------------ 428 // E-vector -> L-vector, offsets provided 429 //------------------------------------------------------------------------------ 430 template <int NCOMP, int COMPSTRIDE, int P1d> 431 inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 432 if (data.tidx < P1d && data.tidy < P1d) 433 for (CeedInt z = 0; z < P1d; ++z) { 434 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 435 const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 436 for (CeedInt comp = 0; comp < NCOMP; ++comp) 437 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 438 } 439 } 440 441 //------------------------------------------------------------------------------ 442 // E-vector -> L-vector, strided 443 //------------------------------------------------------------------------------ 444 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 445 inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 446 if (data.tidx < P1d && data.tidy < P1d) 447 for (CeedInt z = 0; z < P1d; ++z) { 448 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 449 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 450 for (CeedInt comp = 0; comp < NCOMP; ++comp) 451 d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 452 } 453 } 454 455 //------------------------------------------------------------------------------ 456 // 3D tensor contract x 457 //------------------------------------------------------------------------------ 458 template <int NCOMP, int P1d, int Q1d> 459 inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 460 CeedScalar r_B[P1d]; 461 for (CeedInt i = 0; i < P1d; ++i) 462 r_B[i] = B[i + data.tidx*P1d]; 463 464 for (CeedInt k = 0; k < P1d; ++k) { 465 data.slice[data.tidx+data.tidy*T1d] = U[k]; 466 __syncthreads(); 467 V[k] = 0.0; 468 if (data.tidx < Q1d && data.tidy < P1d) 469 for (CeedInt i = 0; i < P1d; ++i) 470 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 471 __syncthreads(); 472 } 473 } 474 475 //------------------------------------------------------------------------------ 476 // 3D tensor contract y 477 //------------------------------------------------------------------------------ 478 template <int NCOMP, int P1d, int Q1d> 479 inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 480 CeedScalar r_B[P1d]; 481 for (CeedInt i = 0; i < P1d; ++i) 482 r_B[i] = B[i + data.tidy*P1d]; 483 484 for (CeedInt k = 0; k < P1d; ++k) { 485 data.slice[data.tidx+data.tidy*T1d] = U[k]; 486 __syncthreads(); 487 V[k] = 0.0; 488 if (data.tidx < Q1d && data.tidy < Q1d) 489 for (CeedInt i = 0; i < P1d; ++i) 490 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 491 __syncthreads(); 492 } 493 } 494 495 //------------------------------------------------------------------------------ 496 // 3D tensor contract z 497 //------------------------------------------------------------------------------ 498 template <int NCOMP, int P1d, int Q1d> 499 inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 500 for (CeedInt k = 0; k < Q1d; ++k) { 501 V[k] = 0.0; 502 if (data.tidx < Q1d && data.tidy < Q1d) 503 for (CeedInt i = 0; i < P1d; ++i) 504 V[k] += B[i + k*P1d] * U[i]; // Contract z direction 505 } 506 } 507 508 //------------------------------------------------------------------------------ 509 // 3D transpose tensor contract z 510 //------------------------------------------------------------------------------ 511 template <int NCOMP, int P1d, int Q1d> 512 inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 513 for (CeedInt k = 0; k < P1d; ++k) { 514 V[k] = 0.0; 515 if (data.tidx < Q1d && data.tidy < Q1d) 516 for (CeedInt i = 0; i < Q1d; ++i) 517 V[k] += B[k + i*P1d] * U[i]; // Contract z direction 518 } 519 } 520 521 //------------------------------------------------------------------------------ 522 // 3D transpose tensor contract y 523 //------------------------------------------------------------------------------ 524 template <int NCOMP, int P1d, int Q1d> 525 inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 526 CeedScalar r_B[Q1d]; 527 for (CeedInt i = 0; i < Q1d; ++i) 528 r_B[i] = B[data.tidy + i*P1d]; 529 530 for (CeedInt k = 0; k < P1d; ++k) { 531 data.slice[data.tidx+data.tidy*T1d] = U[k]; 532 __syncthreads(); 533 V[k] = 0.0; 534 if (data.tidx < Q1d && data.tidy < P1d) 535 for (CeedInt i = 0; i < Q1d; ++i) 536 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 537 __syncthreads(); 538 } 539 } 540 541 //------------------------------------------------------------------------------ 542 // 3D transpose tensor contract add y 543 //------------------------------------------------------------------------------ 544 template <int NCOMP, int P1d, int Q1d> 545 inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 546 CeedScalar r_B[Q1d]; 547 for (CeedInt i = 0; i < Q1d; ++i) 548 r_B[i] = B[data.tidy + i*P1d]; 549 550 for (CeedInt k = 0; k < P1d; ++k) { 551 data.slice[data.tidx+data.tidy*T1d] = U[k]; 552 __syncthreads(); 553 if (data.tidx < Q1d && data.tidy < P1d) 554 for (CeedInt i = 0; i < Q1d; ++i) 555 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 556 __syncthreads(); 557 } 558 } 559 560 //------------------------------------------------------------------------------ 561 // 3D transpose tensor contract x 562 //------------------------------------------------------------------------------ 563 template <int NCOMP, int P1d, int Q1d> 564 inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 565 CeedScalar r_B[Q1d]; 566 for (CeedInt i = 0; i < Q1d; ++i) 567 r_B[i] = B[data.tidx + i*P1d]; 568 569 for (CeedInt k = 0; k < P1d; ++k) { 570 data.slice[data.tidx+data.tidy*T1d] = U[k]; 571 __syncthreads(); 572 V[k] = 0.0; 573 if (data.tidx < P1d && data.tidy < P1d) 574 for (CeedInt i = 0; i < Q1d; ++i) 575 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 576 __syncthreads(); 577 } 578 } 579 580 //------------------------------------------------------------------------------ 581 // 3D transpose tensor contract add x 582 //------------------------------------------------------------------------------ 583 template <int NCOMP, int P1d, int Q1d> 584 inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 585 CeedScalar r_B[Q1d]; 586 for (CeedInt i = 0; i < Q1d; ++i) 587 r_B[i] = B[data.tidx + i*P1d]; 588 589 for (CeedInt k = 0; k < P1d; ++k) { 590 data.slice[data.tidx+data.tidy*T1d] = U[k]; 591 __syncthreads(); 592 if (data.tidx < P1d && data.tidy < P1d) 593 for (CeedInt i = 0; i < Q1d; ++i) 594 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 595 __syncthreads(); 596 } 597 } 598 599 //------------------------------------------------------------------------------ 600 // 3D interpolate to quadrature points 601 //------------------------------------------------------------------------------ 602 template <int NCOMP, int P1d, int Q1d> 603 inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 604 CeedScalar r_t1[T1d]; 605 CeedScalar r_t2[T1d]; 606 for (CeedInt comp = 0; comp < NCOMP; comp++) { 607 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 608 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 609 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 610 } 611 } 612 613 //------------------------------------------------------------------------------ 614 // 3D interpolate transpose 615 //------------------------------------------------------------------------------ 616 template <int NCOMP, int P1d, int Q1d> 617 inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 618 CeedScalar r_t1[T1d]; 619 CeedScalar r_t2[T1d]; 620 for (CeedInt comp = 0; comp < NCOMP; comp++) { 621 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 622 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 623 ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 624 } 625 } 626 627 //------------------------------------------------------------------------------ 628 // 3D derivatives at quadrature points 629 //------------------------------------------------------------------------------ 630 template <int NCOMP, int P1d, int Q1d> 631 inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 632 CeedScalar r_t1[T1d]; 633 CeedScalar r_t2[T1d]; 634 for (CeedInt comp = 0; comp < NCOMP; comp++) { 635 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 636 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 637 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 638 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 639 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 640 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 641 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 642 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 643 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 644 } 645 } 646 647 //------------------------------------------------------------------------------ 648 // 3D derivatives transpose 649 //------------------------------------------------------------------------------ 650 template <int NCOMP, int P1d, int Q1d> 651 inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 652 CeedScalar r_t1[T1d]; 653 CeedScalar r_t2[T1d]; 654 for (CeedInt comp = 0; comp < NCOMP; comp++) { 655 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 656 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 657 ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 658 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 659 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 660 ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 661 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 662 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 663 ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 664 } 665 } 666 667 //------------------------------------------------------------------------------ 668 // 3D collocated derivatives computation 669 //------------------------------------------------------------------------------ 670 template <int NCOMP, int Q1d> 671 inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 672 if (data.tidx < Q1d && data.tidy < Q1d) { 673 for (CeedInt comp = 0; comp < NCOMP; ++comp) { 674 data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 675 __syncthreads(); 676 // X derivative 677 r_V[comp+0*NCOMP] = 0.0; 678 for (CeedInt i = 0; i < Q1d; ++i) 679 r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 680 // Y derivative 681 r_V[comp+1*NCOMP] = 0.0; 682 for (CeedInt i = 0; i < Q1d; ++i) 683 r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 684 // Z derivative 685 r_V[comp+2*NCOMP] = 0.0; 686 for (CeedInt i = 0; i < Q1d; ++i) 687 r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 688 __syncthreads(); 689 } 690 } 691 } 692 693 //------------------------------------------------------------------------------ 694 // 3D collocated derivatives transpose 695 //------------------------------------------------------------------------------ 696 template <int NCOMP, int Q1d> 697 inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 698 if (data.tidx < Q1d && data.tidy < Q1d) { 699 for (CeedInt comp = 0; comp < NCOMP; ++comp) { 700 // X derivative 701 data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 702 __syncthreads(); 703 for (CeedInt i = 0; i < Q1d; ++i) 704 r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 705 __syncthreads(); 706 // Y derivative 707 data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP]; 708 __syncthreads(); 709 for (CeedInt i = 0; i < Q1d; ++i) 710 r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 711 __syncthreads(); 712 // Z derivative 713 for (CeedInt i = 0; i < Q1d; ++i) 714 r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 715 } 716 } 717 } 718 719 //------------------------------------------------------------------------------ 720 // 1D quadrature weights 721 //------------------------------------------------------------------------------ 722 template <int Q1d> 723 inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 724 *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 725 } 726 727 //------------------------------------------------------------------------------ 728 // 2D quadrature weights 729 //------------------------------------------------------------------------------ 730 template <int Q1d> 731 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 732 *w = (data.tidx < Q1d && data.tidy < Q1d) ? 733 qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 734 } 735 736 //------------------------------------------------------------------------------ 737 // 3D quadrature weights 738 //------------------------------------------------------------------------------ 739 template <int Q1d> 740 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 741 const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 742 const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 743 for (CeedInt z = 0; z < Q1d; ++z) 744 w[z] = quad ? pw*qweight1d[z] : 0.0; 745 } 746 747 ); 748 //------------------------------------------------------------------------------ 749 // Build singe operator kernel 750 //------------------------------------------------------------------------------ 751 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 752 753 using std::ostringstream; 754 using std::string; 755 int ierr; 756 bool setupdone; 757 ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); 758 if (setupdone) return CEED_ERROR_SUCCESS; 759 Ceed ceed; 760 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 761 CeedOperator_Cuda_gen *data; 762 ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 763 CeedQFunction qf; 764 CeedQFunction_Cuda_gen *qf_data; 765 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 766 ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 767 CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 768 numoutputfields, ncomp, dim = 0, lsize; 769 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 770 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 771 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 772 CeedChkBackend(ierr); 773 CeedOperatorField *opinputfields, *opoutputfields; 774 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 775 CeedChkBackend(ierr); 776 CeedQFunctionField *qfinputfields, *qfoutputfields; 777 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 778 CeedChkBackend(ierr); 779 CeedEvalMode emode; 780 CeedBasis basis; 781 CeedBasis_Cuda_shared *basis_data; 782 CeedElemRestriction Erestrict; 783 CeedElemRestriction_Cuda *restr_data; 784 785 // Check for restriction only identity operator 786 bool is_identity_qf; 787 ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 788 if (is_identity_qf) { 789 CeedEvalMode emodein, emodeout; 790 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein); CeedChkBackend(ierr); 791 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout); CeedChkBackend(ierr); 792 if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE) 793 // LCOV_EXCL_START 794 return CeedError(ceed, CEED_ERROR_BACKEND, 795 "Backend does not implement restriction only identity operators"); 796 // LCOV_EXCL_STOP 797 } 798 799 ostringstream code; 800 string devFunctions(deviceFunctions); 801 802 // Add atomicAdd function for old NVidia architectures 803 struct cudaDeviceProp prop; 804 Ceed_Cuda *ceed_data; 805 ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); 806 ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 807 if (prop.major<6){ 808 code << atomicAdd; 809 } 810 811 code << devFunctions; 812 813 string qFunction(qf_data->qFunctionSource); 814 string qFunctionName(qf_data->qFunctionName); 815 string oper; 816 oper = "CeedKernel_Cuda_gen_" + qFunctionName; 817 818 code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 819 code << "#define CeedPragmaSIMD\n"; 820 code << "#define CEED_ERROR_SUCCESS 0\n\n"; 821 822 // Find dim and Q1d 823 bool useCollograd = true; 824 data->maxP1d = 0; 825 for (CeedInt i = 0; i < numinputfields; i++) { 826 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 827 if (basis != CEED_BASIS_COLLOCATED) { 828 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 829 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 830 CeedChkBackend(ierr); 831 832 // Check for collocated gradient 833 useCollograd = useCollograd && basis_data->d_collograd1d; 834 835 // Collect dim and Q1d 836 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 837 bool isTensor; 838 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 839 if (isTensor) { 840 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 841 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 842 if (P1d>data->maxP1d) data->maxP1d = P1d; 843 } else { 844 // LCOV_EXCL_START 845 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 846 // LCOV_EXCL_STOP 847 } 848 } 849 } 850 // Check output bases for Q1d, dim as well 851 // The only imput basis might be CEED_BASIS_COLLOCATED 852 for (CeedInt i = 0; i < numoutputfields; i++) { 853 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 854 855 if (basis != CEED_BASIS_COLLOCATED) { 856 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 857 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 858 CeedChkBackend(ierr); 859 860 // Collect dim and Q1d 861 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 862 bool isTensor; 863 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 864 if (isTensor) { 865 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 866 } else { 867 // LCOV_EXCL_START 868 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 869 // LCOV_EXCL_STOP 870 } 871 872 // Check for collocated gradient 873 useCollograd = useCollograd && basis_data->d_collograd1d; 874 } 875 } 876 data->dim = dim; 877 data->Q1d = Q1d; 878 879 // Define CEED_Q_VLA 880 if (dim != 3 || useCollograd) { 881 code << "\n#define CEED_Q_VLA 1\n\n"; 882 } else { 883 code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 884 } 885 886 code << qFunction; 887 888 // Setup 889 code << "\n// -----------------------------------------------------------------------------\n"; 890 code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 891 for (CeedInt i = 0; i < numinputfields; i++) { 892 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 893 CeedChkBackend(ierr); 894 if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 895 code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 896 } 897 } 898 899 for (CeedInt i = 0; i < numoutputfields; i++) { 900 code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 901 } 902 903 code << " const CeedInt Dim = "<<dim<<";\n"; 904 code << " const CeedInt Q1d = "<<Q1d<<";\n"; 905 906 code << " extern __shared__ CeedScalar slice[];\n"; 907 code << " BackendData data;\n"; 908 code << " data.tidx = threadIdx.x;\n"; 909 code << " data.tidy = threadIdx.y;\n"; 910 code << " data.tidz = threadIdx.z;\n"; 911 code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 912 code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 913 914 code << "\n // -- Input field constants and basis data --\n"; 915 //Initialize constants, and matrices B and G 916 for (CeedInt i = 0; i < numinputfields; i++) { 917 code << " // ---- Input field "<<i<<" ----\n"; 918 // Get elemsize, emode, ncomp 919 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 920 CeedChkBackend(ierr); 921 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 922 CeedChkBackend(ierr); 923 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 924 CeedChkBackend(ierr); 925 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 926 CeedChkBackend(ierr); 927 928 // Set field constants 929 if (emode != CEED_EVAL_WEIGHT) { 930 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 931 if (basis != CEED_BASIS_COLLOCATED) { 932 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 933 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 934 } else { 935 code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 936 } 937 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 938 } 939 940 // Load basis data 941 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 942 switch (emode) { 943 case CEED_EVAL_NONE: 944 break; 945 case CEED_EVAL_INTERP: 946 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 947 data->B.in[i] = basis_data->d_interp1d; 948 code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 949 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 950 break; 951 case CEED_EVAL_GRAD: 952 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 953 data->B.in[i] = basis_data->d_interp1d; 954 code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 955 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 956 if (useCollograd) { 957 data->G.in[i] = basis_data->d_collograd1d; 958 code << " __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 959 code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 960 } else { 961 data->G.in[i] = basis_data->d_grad1d; 962 code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 963 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 964 } 965 break; 966 case CEED_EVAL_WEIGHT: 967 break; // No action 968 case CEED_EVAL_DIV: 969 break; // TODO: Not implemented 970 case CEED_EVAL_CURL: 971 break; // TODO: Not implemented 972 } 973 } 974 975 code << "\n // -- Output field constants and basis data --\n"; 976 for (CeedInt i = 0; i < numoutputfields; i++) { 977 code << " // ---- Output field "<<i<<" ----\n"; 978 // Get elemsize, emode, ncomp 979 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 980 CeedChkBackend(ierr); 981 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 982 CeedChkBackend(ierr); 983 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 984 CeedChkBackend(ierr); 985 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 986 CeedChkBackend(ierr); 987 988 // Set field constants 989 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 990 if (basis != CEED_BASIS_COLLOCATED) { 991 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 992 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 993 } else { 994 code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 995 } 996 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 997 998 // Load basis data 999 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1000 switch (emode) { 1001 case CEED_EVAL_NONE: 1002 break; // No action 1003 case CEED_EVAL_INTERP: 1004 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1005 data->B.out[i] = basis_data->d_interp1d; 1006 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1007 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1008 break; 1009 case CEED_EVAL_GRAD: 1010 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1011 data->B.out[i] = basis_data->d_interp1d; 1012 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1013 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1014 if (useCollograd) { 1015 data->G.out[i] = basis_data->d_collograd1d; 1016 code << " __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 1017 code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1018 } else { 1019 data->G.out[i] = basis_data->d_grad1d; 1020 code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1021 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1022 } 1023 break; 1024 // LCOV_EXCL_START 1025 case CEED_EVAL_WEIGHT: { 1026 Ceed ceed; 1027 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1028 return CeedError(ceed, CEED_ERROR_BACKEND, 1029 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1030 break; // Should not occur 1031 } 1032 case CEED_EVAL_DIV: 1033 break; // TODO: Not implemented 1034 case CEED_EVAL_CURL: 1035 break; // TODO: Not implemented 1036 // LCOV_EXCL_STOP 1037 } 1038 } 1039 code << "\n // -- Element loop --\n"; 1040 code << " __syncthreads();\n"; 1041 code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1042 // Input basis apply if needed 1043 // Generate the correct eval mode code for each input 1044 code << " // -- Input field restrictions and basis actions --\n"; 1045 for (CeedInt i = 0; i < numinputfields; i++) { 1046 code << " // ---- Input field "<<i<<" ----\n"; 1047 // Get elemsize, emode, ncomp 1048 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1049 CeedChkBackend(ierr); 1050 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1051 CeedChkBackend(ierr); 1052 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1053 CeedChkBackend(ierr); 1054 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1055 CeedChkBackend(ierr); 1056 1057 // Restriction 1058 if (emode != CEED_EVAL_WEIGHT && 1059 !((emode == CEED_EVAL_NONE) && useCollograd)) { 1060 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1061 1062 bool isStrided; 1063 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 1064 if (!isStrided) { 1065 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1066 CeedChkBackend(ierr); 1067 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1068 CeedInt compstride; 1069 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 1070 code << " // CompStride: "<<compstride<<"\n"; 1071 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 1072 data->indices.in[i] = restr_data->d_ind; 1073 code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1074 } else { 1075 bool backendstrides; 1076 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1077 CeedChkBackend(ierr); 1078 CeedInt nelem; 1079 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1080 CeedChkBackend(ierr); 1081 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1082 if (!backendstrides) { 1083 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1084 CeedChkBackend(ierr); 1085 } 1086 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1087 code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1088 } 1089 } 1090 1091 // Basis action 1092 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1093 switch (emode) { 1094 case CEED_EVAL_NONE: 1095 if (!useCollograd) { 1096 code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1097 } 1098 break; 1099 case CEED_EVAL_INTERP: 1100 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1101 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1102 break; 1103 case CEED_EVAL_GRAD: 1104 if (useCollograd) { 1105 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1106 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1107 } else { 1108 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1109 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1110 } 1111 break; 1112 case CEED_EVAL_WEIGHT: 1113 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1114 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 1115 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1116 data->W = basis_data->d_qweight1d; 1117 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1118 break; // No action 1119 case CEED_EVAL_DIV: 1120 break; // TODO: Not implemented 1121 case CEED_EVAL_CURL: 1122 break; // TODO: Not implemented 1123 } 1124 } 1125 1126 // Q function 1127 code << "\n // -- Output field setup --\n"; 1128 for (CeedInt i = 0; i < numoutputfields; i++) { 1129 code << "\n // ---- Output field "<<i<<" ----\n"; 1130 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1131 CeedChkBackend(ierr); 1132 if (emode==CEED_EVAL_GRAD) 1133 { 1134 if (useCollograd) { 1135 //Accumulator for gradient slices 1136 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1137 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1138 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1139 code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1140 code << " }\n"; 1141 code << " }\n"; 1142 } else { 1143 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1144 } 1145 } 1146 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1147 { 1148 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1149 } 1150 } 1151 // We treat quadrature points per slice in 3d to save registers 1152 if (useCollograd) { 1153 code << "\n // Note: Collocated Gradient\n"; 1154 code << "#pragma unroll\n"; 1155 code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1156 code << " // -- Input fields --\n"; 1157 for (CeedInt i = 0; i < numinputfields; i++) { 1158 code << " // ---- Input field "<<i<<" ----\n"; 1159 // Get elemsize, emode, ncomp 1160 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1161 CeedChkBackend(ierr); 1162 // Basis action 1163 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1164 switch (emode) { 1165 case CEED_EVAL_NONE: 1166 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1167 1168 bool isStrided; 1169 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr); 1170 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr); 1171 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 1172 if (!isStrided) { 1173 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1174 CeedChkBackend(ierr); 1175 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1176 CeedInt compstride; 1177 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 1178 code << " // CompStride: "<<compstride<<"\n"; 1179 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 1180 data->indices.in[i] = restr_data->d_ind; 1181 code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1182 } else { 1183 bool backendstrides; 1184 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1185 CeedChkBackend(ierr); 1186 CeedInt nelem; 1187 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1188 CeedChkBackend(ierr); 1189 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1190 if (!backendstrides) { 1191 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1192 CeedChkBackend(ierr); 1193 } 1194 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1195 code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1196 } 1197 break; 1198 case CEED_EVAL_INTERP: 1199 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1200 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1201 code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1202 code << " }\n"; 1203 break; 1204 case CEED_EVAL_GRAD: 1205 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1206 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1207 break; 1208 case CEED_EVAL_WEIGHT: 1209 code << " CeedScalar r_q"<<i<<"[1];\n"; 1210 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1211 break; // No action 1212 case CEED_EVAL_DIV: 1213 break; // TODO: Not implemented 1214 case CEED_EVAL_CURL: 1215 break; // TODO: Not implemented 1216 } 1217 } 1218 code << "\n // -- Output fields --\n"; 1219 for (CeedInt i = 0; i < numoutputfields; i++) { 1220 code << " // ---- Output field "<<i<<" ----\n"; 1221 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1222 CeedChkBackend(ierr); 1223 // Basis action 1224 switch (emode) { 1225 case CEED_EVAL_NONE: 1226 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1227 break; // No action 1228 case CEED_EVAL_INTERP: 1229 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1230 break; 1231 case CEED_EVAL_GRAD: 1232 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1233 break; 1234 case CEED_EVAL_WEIGHT: 1235 break; // Should not occur 1236 case CEED_EVAL_DIV: 1237 break; // TODO: Not implemented 1238 case CEED_EVAL_CURL: 1239 break; // TODO: Not implemented 1240 } 1241 } 1242 } else { 1243 code << "\n // Note: No Collocated Gradient\n"; 1244 code << " // -- Input fields --\n"; 1245 for (CeedInt i = 0; i < numinputfields; i++) { 1246 code << " // ---- Input field "<<i<<" ----\n"; 1247 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1248 } 1249 code << " // -- Output fields --\n"; 1250 for (CeedInt i = 0; i < numoutputfields; i++) { 1251 code << " // ---- Output field "<<i<<" ----\n"; 1252 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1253 } 1254 } 1255 code << "\n // -- QFunction Inputs and outputs --\n"; 1256 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1257 for (CeedInt i = 0; i < numinputfields; i++) { 1258 code << " // ---- Input field "<<i<<" ----\n"; 1259 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1260 } 1261 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1262 for (CeedInt i = 0; i < numoutputfields; i++) { 1263 code << " // ---- Output field "<<i<<" ----\n"; 1264 code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1265 } 1266 code << "\n // -- Apply QFunction --\n"; 1267 code << " "<<qFunctionName<<"(ctx, "; 1268 if (dim != 3 || useCollograd) { 1269 code << "1"; 1270 } else { 1271 code << "Q1d"; 1272 } 1273 code << ", in, out);\n"; 1274 if (useCollograd) { 1275 code << "\n // Note: Collocated Gradient\n"; 1276 code << " // -- Output fields --\n"; 1277 for (CeedInt i = 0; i < numoutputfields; i++) { 1278 code << " // ---- Output field "<<i<<" ----\n"; 1279 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1280 CeedChkBackend(ierr); 1281 // Basis action 1282 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1283 switch (emode) { 1284 case CEED_EVAL_NONE: 1285 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1286 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1287 code << " }\n"; 1288 break; // No action 1289 case CEED_EVAL_INTERP: 1290 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1291 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1292 code << " }\n"; 1293 break; 1294 case CEED_EVAL_GRAD: 1295 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1296 break; 1297 case CEED_EVAL_WEIGHT: 1298 break; // Should not occur 1299 case CEED_EVAL_DIV: 1300 break; // TODO: Not implemented 1301 case CEED_EVAL_CURL: 1302 break; // TODO: Not implemented 1303 } 1304 } 1305 code << " }\n"; 1306 } 1307 1308 // Output basis apply if needed 1309 // Generate the correct eval mode code for each output 1310 code << "\n // -- Output field basis action and restrictions --\n"; 1311 for (CeedInt i = 0; i < numoutputfields; i++) { 1312 code << " // ---- Output field "<<i<<" ----\n"; 1313 // Get elemsize, emode, ncomp 1314 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1315 CeedChkBackend(ierr); 1316 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1317 CeedChkBackend(ierr); 1318 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1319 CeedChkBackend(ierr); 1320 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1321 CeedChkBackend(ierr); 1322 // Basis action 1323 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1324 switch (emode) { 1325 case CEED_EVAL_NONE: 1326 code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1327 break; // No action 1328 case CEED_EVAL_INTERP: 1329 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1330 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1331 break; 1332 case CEED_EVAL_GRAD: 1333 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1334 if (useCollograd) { 1335 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1336 } else { 1337 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1338 } 1339 break; 1340 // LCOV_EXCL_START 1341 case CEED_EVAL_WEIGHT: { 1342 Ceed ceed; 1343 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1344 return CeedError(ceed, CEED_ERROR_BACKEND, 1345 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1346 break; // Should not occur 1347 } 1348 case CEED_EVAL_DIV: 1349 break; // TODO: Not implemented 1350 case CEED_EVAL_CURL: 1351 break; // TODO: Not implemented 1352 // LCOV_EXCL_STOP 1353 } 1354 // Restriction 1355 bool isStrided; 1356 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 1357 if (!isStrided) { 1358 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1359 CeedChkBackend(ierr); 1360 code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 1361 CeedInt compstride; 1362 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 1363 code << " // CompStride: "<<compstride<<"\n"; 1364 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 1365 data->indices.out[i] = restr_data->d_ind; 1366 code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1367 } else { 1368 bool backendstrides; 1369 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1370 CeedChkBackend(ierr); 1371 CeedInt nelem; 1372 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1373 CeedChkBackend(ierr); 1374 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1375 if (!backendstrides) { 1376 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1377 CeedChkBackend(ierr); 1378 } 1379 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1380 code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1381 } 1382 } 1383 1384 code << " }\n"; 1385 code << "}\n"; 1386 code << "// -----------------------------------------------------------------------------\n\n"; 1387 1388 // View kernel for debugging 1389 CeedDebug(code.str().c_str()); 1390 1391 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 1392 "T1d", CeedIntMax(Q1d, data->maxP1d)); 1393 CeedChkBackend(ierr); 1394 ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1395 CeedChkBackend(ierr); 1396 1397 ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 1398 return CEED_ERROR_SUCCESS; 1399 } 1400 //------------------------------------------------------------------------------ 1401