1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#define CEED_DEBUG_COLOR 12

#include <ceed/ceed.h>
#include <ceed/backend.h>
#include <cuda_runtime.h>
#include <iostream>
#include <sstream>
#include "ceed-cuda-gen.h"
#include "../cuda/ceed-cuda-compile.h"
#include "../cuda-shared/ceed-cuda-shared.h"

// JIT source string: CAS-loop emulation of atomicAdd for CeedScalar.
// NOTE(review): this is the standard CUDA Programming Guide fallback for
// double-precision atomicAdd and assumes CeedScalar is double
// (__double_as_longlong); presumably it is appended to the JIT source only
// for architectures without native double atomicAdd (pre-SM60) -- confirm at
// the point where this string is consumed.
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
//------------------------------------------------------------------------------
__device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);

// JIT source string: device helpers shared by the generated operator kernels
// (element restriction read/write, tensor contractions, interp/grad, 1D-3D).
// Comments inside QUOTE() are removed during macro stringification, so they
// never reach the JIT compiler. NOTE(review): T1d is not defined here -- the
// code generator is presumed to define it (apparently max(P1d, Q1d), used as
// the leading dimension of the shared-memory slice) before compiling this
// fragment; confirm against the kernel-generation code.
static const char *deviceFunctions = QUOTE(

//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
// Pointer bundles for up to 16 input and 16 output fields per operator.
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread kernel state: cached thread indices plus this thread block's
// shared-memory scratch slice used by the contractions below.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperative block-strided copy of a P x Q basis matrix from d_B into B;
// strided by the full block size so every thread participates.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Each thread with tidx < P1d gathers its node's NCOMP components into
// registers r_u through the indirection array 'indices'.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather without indirection; node/component/element strides are
// compile-time template parameters.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add through 'indices'; atomicAdd because several elements may
// contribute to the same L-vector dof.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided scatter-add. NOTE(review): plain += (no atomics) -- presumably the
// strided layout guarantees distinct destinations per thread; confirm.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// V = sum_i B[i + tidx*P1d] * U[i]; U is staged through the shared slice so
// every thread can read all P1d values. Both barriers protect slice reuse
// across successive contractions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
// Transpose of ContractX1d: V = sum_i B[tidx + i*P1d] * U[i], i over Q1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
// One contraction with c_B per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// In 1D the gradient is a single contraction with c_G; c_B is unused here,
// kept so all grad* helpers share one signature.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// 2D gather: one thread per (tidx, tidy) node of the P1d x P1d element.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// 2D strided gather, no indirection.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// 2D scatter-add through 'indices'; atomicAdd because neighboring elements
// share L-vector dofs.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// 2D strided scatter-add. NOTE(review): plain += (no atomics) -- presumably
// the strided layout guarantees distinct destinations per thread; confirm.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Contract the x index with B: V(qx, py) = sum_px B[px + qx*P1d] * U(px, py).
// U is staged through the T1d-pitched shared slice; barriers protect slice
// reuse across successive contraction calls.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Contract the y index with B: V(qx, qy) = sum_py B[py + qy*P1d] * U(qx, py).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contraction in y: V(qx, py) = sum_qy B[py + qy*P1d] * U(qx, qy).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose contraction in x: V(px, py) = sum_qx B[px + qx*P1d] * U(qx, py).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractXTranspose2d but accumulates into V instead of overwriting
// (no *V = 0.0); used to sum x- and y-derivative contributions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Tensor-product interpolation: contract x then y with c_B per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract y then x with c_B per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// d/dx = G_x ox B_y (output comps [0, NCOMP)); d/dy = B_x ox G_y
// (output comps [NCOMP, 2*NCOMP)).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: both derivative slots accumulate into the same output
// component, hence the final ContractXTransposeAdd2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// 3D gather: (tidx, tidy) thread loops over the z layers; registers hold P1d
// values per component (r_u[z + comp*P1d]).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// 3D strided gather, no indirection.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Read one z-slice (layer q) of quadrature-point data through 'indices'.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Read one z-slice (layer q) of quadrature-point data with compile-time strides.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// 3D scatter-add through 'indices'; atomicAdd handles dofs shared by elements.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// 3D strided scatter-add. NOTE(review): plain += (no atomics) -- presumably
// the strided layout guarantees distinct destinations per thread; confirm.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
    }
}

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Contract x for each of the P1d z-layers; the needed row of B is cached in
// registers, each layer of U is staged through the shared slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// Contract y for each of the P1d z-layers, analogous to ContractX3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// z is held per-thread in registers, so no shared staging or barriers needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Transpose of ContractZ3d: V[p] = sum_q B[p + q*P1d] * U[q].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contraction in y, per z-layer, with the B column in registers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3d, but accumulates into V (no zeroing) so that
// several contributions can be summed in place.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Apply B^T along the x direction via the shared slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Accumulating variant of ContractTransposeX3d: adds into V without zeroing.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
data.slice[data.tidx+data.tidy*T1d] = U[k]; // (continuation of ContractTransposeAddX3d)
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate nodal values r_U (P1d values per component, per thread) to the
// quadrature points by applying c_B in x, then y, then z, using two register
// scratch buffers of length T1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of interp3d: project quadrature-point values back to nodes by
// applying B^T in z, then y, then x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Tensor-product gradient: for each of the three directions, apply the
// derivative matrix c_G in that direction and interpolation c_B in the other
// two; the three results land in consecutive NCOMP*Q1d slots of r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void
grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // d/dx: derivative matrix in x, interpolation in y and z
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    // d/dy
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    // d/dz
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of grad3d: each of the three derivative slots of r_U is pulled
// back with the transposed contractions and summed into r_V (the first slot
// overwrites via ContractTransposeX3d, the other two accumulate via the Add
// variant).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // x-derivative slot
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    // y-derivative slot
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    // z-derivative slot
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d,
Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Compute the x/y/z derivatives of r_U at the collocated quadrature level q
// using the collocated derivative matrix c_G (Q1d x Q1d, per the indexing
// below). Results go to the three NCOMP-sized slots of r_V.
// Fix: the component loop and both __syncthreads() now execute uniformly over
// the whole thread block. The original placed the barriers inside the
// (tidx < Q1d && tidy < Q1d) guard, which is undefined behavior whenever the
// block contains threads outside the guard (e.g. T1d > Q1d when P1d > Q1d) —
// those threads would skip a barrier other threads wait on. Numerical results
// are unchanged: shared-memory stores and all computation are still performed
// only by the guarded threads.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    if (quad)
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
    __syncthreads(); // slice for this component fully staged before reads
    if (quad) {
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      // Z derivative (register-only: contracts the thread's own z-column)
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    }
    __syncthreads(); // reads complete before the next component overwrites slice
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// NOTE(review): the barriers in this function execute inside a divergent
// (tidx < Q1d && tidy < Q1d) guard; if the block is launched with threads
// outside that guard (T1d > Q1d) this is undefined behavior — consider the
// same uniform-barrier restructuring applied to gradCollo3d above.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // X derivative
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
      __syncthreads();
      for
(CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // NOTE(review): this and the following __syncthreads() run inside the
      // divergent guard opened above — undefined behavior if the block has
      // threads with tidx >= Q1d or tidy >= Q1d; verify the launch shape.
      __syncthreads();
      // Y derivative
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      __syncthreads();
      // Z derivative: only the level-q contribution is added here; the
      // enclosing q loop in the generated kernel completes the z sum.
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
    }
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
// Each thread with tidx < Q1d loads its 1D weight; all other threads get 0.
template <int Q1d>
inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
  *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// Tensor-product weight qweight1d[tidx] * qweight1d[tidy]; 0 off the grid.
template <int Q1d>
inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
  *w = (data.tidx < Q1d && data.tidy < Q1d) ?
        qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
// Tensor-product weights for a thread's whole z-column: w[z] = wx * wy * wz,
// with the x-y partial weight pw computed once.
template <int Q1d>
inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
  const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
  const CeedScalar pw = quad ?
qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 744 for (CeedInt z = 0; z < Q1d; ++z) 745 w[z] = quad ? pw*qweight1d[z] : 0.0; 746 } 747 748 ); 749 //------------------------------------------------------------------------------ 750 // Build singe operator kernel 751 //------------------------------------------------------------------------------ 752 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 753 754 using std::ostringstream; 755 using std::string; 756 int ierr; 757 bool setupdone; 758 ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); 759 if (setupdone) return CEED_ERROR_SUCCESS; 760 Ceed ceed; 761 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 762 CeedOperator_Cuda_gen *data; 763 ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 764 CeedQFunction qf; 765 CeedQFunction_Cuda_gen *qf_data; 766 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 767 ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 768 CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields, 769 numoutputfields, ncomp, dim = 0, lsize; 770 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 771 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 772 CeedOperatorField *opinputfields, *opoutputfields; 773 ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields); 774 CeedChkBackend(ierr); 775 CeedQFunctionField *qfinputfields, *qfoutputfields; 776 ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 777 CeedChkBackend(ierr); 778 CeedEvalMode emode; 779 CeedBasis basis; 780 CeedBasis_Cuda_shared *basis_data; 781 CeedElemRestriction Erestrict; 782 CeedElemRestriction_Cuda *restr_data; 783 784 // Check for restriction only identity operator 785 bool is_identity_qf; 786 ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 787 if (is_identity_qf) { 788 CeedEvalMode emodein, 
emodeout; 789 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein); CeedChkBackend(ierr); 790 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout); CeedChkBackend(ierr); 791 if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE) 792 // LCOV_EXCL_START 793 return CeedError(ceed, CEED_ERROR_BACKEND, 794 "Backend does not implement restriction only identity operators"); 795 // LCOV_EXCL_STOP 796 } 797 798 ostringstream code; 799 string devFunctions(deviceFunctions); 800 801 // Add atomicAdd function for old NVidia architectures 802 struct cudaDeviceProp prop; 803 Ceed_Cuda *ceed_data; 804 ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr); 805 ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); CeedChkBackend(ierr); 806 if ((prop.major<6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){ 807 code << atomicAdd; 808 } 809 810 code << devFunctions; 811 812 string qFunction(qf_data->qFunctionSource); 813 string qFunctionName(qf_data->qFunctionName); 814 string oper; 815 oper = "CeedKernel_Cuda_gen_" + qFunctionName; 816 817 code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 818 code << "#define CEED_QFUNCTION_HELPER inline __device__\n"; 819 code << "#define CeedPragmaSIMD\n"; 820 code << "#define CEED_ERROR_SUCCESS 0\n\n"; 821 822 // Find dim and Q1d 823 bool useCollograd = true; 824 data->maxP1d = 0; 825 for (CeedInt i = 0; i < numinputfields; i++) { 826 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 827 if (basis != CEED_BASIS_COLLOCATED) { 828 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 829 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 830 CeedChkBackend(ierr); 831 832 // Check for collocated gradient 833 useCollograd = useCollograd && basis_data->d_collograd1d; 834 835 // Collect dim and Q1d 836 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 837 bool isTensor; 838 ierr = 
CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 839 if (isTensor) { 840 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 841 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 842 if (P1d>data->maxP1d) data->maxP1d = P1d; 843 } else { 844 // LCOV_EXCL_START 845 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 846 // LCOV_EXCL_STOP 847 } 848 } 849 } 850 // Check output bases for Q1d, dim as well 851 // The only imput basis might be CEED_BASIS_COLLOCATED 852 for (CeedInt i = 0; i < numoutputfields; i++) { 853 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 854 855 if (basis != CEED_BASIS_COLLOCATED) { 856 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 857 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 858 CeedChkBackend(ierr); 859 860 // Collect dim and Q1d 861 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 862 bool isTensor; 863 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 864 if (isTensor) { 865 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 866 } else { 867 // LCOV_EXCL_START 868 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 869 // LCOV_EXCL_STOP 870 } 871 872 // Check for collocated gradient 873 useCollograd = useCollograd && basis_data->d_collograd1d; 874 } 875 } 876 data->dim = dim; 877 data->Q1d = Q1d; 878 879 // Define CEED_Q_VLA 880 if (dim != 3 || useCollograd) { 881 code << "\n#define CEED_Q_VLA 1\n\n"; 882 } else { 883 code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 884 } 885 886 code << qFunction; 887 888 // Setup 889 code << "\n// -----------------------------------------------------------------------------\n"; 890 code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, 
CudaFields G, CeedScalar* W) {\n"; 891 for (CeedInt i = 0; i < numinputfields; i++) { 892 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 893 CeedChkBackend(ierr); 894 if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 895 code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 896 } 897 } 898 899 for (CeedInt i = 0; i < numoutputfields; i++) { 900 code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 901 } 902 903 code << " const CeedInt Dim = "<<dim<<";\n"; 904 code << " const CeedInt Q1d = "<<Q1d<<";\n"; 905 906 code << " extern __shared__ CeedScalar slice[];\n"; 907 code << " BackendData data;\n"; 908 code << " data.tidx = threadIdx.x;\n"; 909 code << " data.tidy = threadIdx.y;\n"; 910 code << " data.tidz = threadIdx.z;\n"; 911 code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 912 code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 913 914 code << "\n // -- Input field constants and basis data --\n"; 915 //Initialize constants, and matrices B and G 916 for (CeedInt i = 0; i < numinputfields; i++) { 917 code << " // ---- Input field "<<i<<" ----\n"; 918 // Get elemsize, emode, ncomp 919 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 920 CeedChkBackend(ierr); 921 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 922 CeedChkBackend(ierr); 923 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 924 CeedChkBackend(ierr); 925 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 926 CeedChkBackend(ierr); 927 928 // Set field constants 929 if (emode != CEED_EVAL_WEIGHT) { 930 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 931 if (basis != CEED_BASIS_COLLOCATED) { 932 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 933 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 934 } else { 935 code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 936 } 937 
code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 938 } 939 940 // Load basis data 941 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 942 switch (emode) { 943 case CEED_EVAL_NONE: 944 break; 945 case CEED_EVAL_INTERP: 946 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 947 data->B.in[i] = basis_data->d_interp1d; 948 code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 949 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 950 break; 951 case CEED_EVAL_GRAD: 952 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 953 data->B.in[i] = basis_data->d_interp1d; 954 code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 955 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 956 if (useCollograd) { 957 data->G.in[i] = basis_data->d_collograd1d; 958 code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 959 code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 960 } else { 961 data->G.in[i] = basis_data->d_grad1d; 962 code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 963 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 964 } 965 break; 966 case CEED_EVAL_WEIGHT: 967 break; // No action 968 case CEED_EVAL_DIV: 969 break; // TODO: Not implemented 970 case CEED_EVAL_CURL: 971 break; // TODO: Not implemented 972 } 973 } 974 975 code << "\n // -- Output field constants and basis data --\n"; 976 for (CeedInt i = 0; i < numoutputfields; i++) { 977 code << " // ---- Output field "<<i<<" ----\n"; 978 // Get elemsize, emode, ncomp 979 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 980 CeedChkBackend(ierr); 981 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 982 CeedChkBackend(ierr); 983 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 984 CeedChkBackend(ierr); 985 ierr = 
CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 986 CeedChkBackend(ierr); 987 988 // Set field constants 989 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 990 if (basis != CEED_BASIS_COLLOCATED) { 991 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 992 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 993 } else { 994 code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 995 } 996 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 997 998 // Load basis data 999 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1000 switch (emode) { 1001 case CEED_EVAL_NONE: 1002 break; // No action 1003 case CEED_EVAL_INTERP: 1004 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1005 data->B.out[i] = basis_data->d_interp1d; 1006 code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1007 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1008 break; 1009 case CEED_EVAL_GRAD: 1010 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1011 data->B.out[i] = basis_data->d_interp1d; 1012 code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1013 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1014 if (useCollograd) { 1015 data->G.out[i] = basis_data->d_collograd1d; 1016 code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 1017 code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1018 } else { 1019 data->G.out[i] = basis_data->d_grad1d; 1020 code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1021 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1022 } 1023 break; 1024 // LCOV_EXCL_START 1025 case CEED_EVAL_WEIGHT: { 1026 Ceed ceed; 1027 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1028 return CeedError(ceed, CEED_ERROR_BACKEND, 1029 "CEED_EVAL_WEIGHT cannot be an output 
evaluation mode"); 1030 break; // Should not occur 1031 } 1032 case CEED_EVAL_DIV: 1033 break; // TODO: Not implemented 1034 case CEED_EVAL_CURL: 1035 break; // TODO: Not implemented 1036 // LCOV_EXCL_STOP 1037 } 1038 } 1039 code << "\n // -- Element loop --\n"; 1040 code << " __syncthreads();\n"; 1041 code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1042 // Input basis apply if needed 1043 // Generate the correct eval mode code for each input 1044 code << " // -- Input field restrictions and basis actions --\n"; 1045 for (CeedInt i = 0; i < numinputfields; i++) { 1046 code << " // ---- Input field "<<i<<" ----\n"; 1047 // Get elemsize, emode, ncomp 1048 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1049 CeedChkBackend(ierr); 1050 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1051 CeedChkBackend(ierr); 1052 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1053 CeedChkBackend(ierr); 1054 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1055 CeedChkBackend(ierr); 1056 1057 // Restriction 1058 if (emode != CEED_EVAL_WEIGHT && 1059 !((emode == CEED_EVAL_NONE) && useCollograd)) { 1060 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1061 1062 bool isStrided; 1063 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 1064 if (!isStrided) { 1065 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1066 CeedChkBackend(ierr); 1067 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1068 CeedInt compstride; 1069 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 1070 code << " // CompStride: "<<compstride<<"\n"; 1071 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 1072 data->indices.in[i] = restr_data->d_ind; 1073 code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, 
lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1074 } else { 1075 bool backendstrides; 1076 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1077 CeedChkBackend(ierr); 1078 CeedInt nelem; 1079 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1080 CeedChkBackend(ierr); 1081 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1082 if (!backendstrides) { 1083 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1084 CeedChkBackend(ierr); 1085 } 1086 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1087 code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1088 } 1089 } 1090 1091 // Basis action 1092 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1093 switch (emode) { 1094 case CEED_EVAL_NONE: 1095 if (!useCollograd) { 1096 code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1097 } 1098 break; 1099 case CEED_EVAL_INTERP: 1100 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1101 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1102 break; 1103 case CEED_EVAL_GRAD: 1104 if (useCollograd) { 1105 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1106 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1107 } else { 1108 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1109 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1110 } 1111 break; 1112 case CEED_EVAL_WEIGHT: 1113 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1114 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 1115 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1116 data->W = basis_data->d_qweight1d; 1117 code << " 
weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1118 break; // No action 1119 case CEED_EVAL_DIV: 1120 break; // TODO: Not implemented 1121 case CEED_EVAL_CURL: 1122 break; // TODO: Not implemented 1123 } 1124 } 1125 1126 // Q function 1127 code << "\n // -- Output field setup --\n"; 1128 for (CeedInt i = 0; i < numoutputfields; i++) { 1129 code << "\n // ---- Output field "<<i<<" ----\n"; 1130 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1131 CeedChkBackend(ierr); 1132 if (emode==CEED_EVAL_GRAD) 1133 { 1134 if (useCollograd) { 1135 //Accumulator for gradient slices 1136 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1137 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1138 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1139 code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1140 code << " }\n"; 1141 code << " }\n"; 1142 } else { 1143 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1144 } 1145 } 1146 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1147 { 1148 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1149 } 1150 } 1151 // We treat quadrature points per slice in 3d to save registers 1152 if (useCollograd) { 1153 code << "\n // Note: Collocated Gradient\n"; 1154 code << "#pragma unroll\n"; 1155 code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1156 code << " // -- Input fields --\n"; 1157 for (CeedInt i = 0; i < numinputfields; i++) { 1158 code << " // ---- Input field "<<i<<" ----\n"; 1159 // Get elemsize, emode, ncomp 1160 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1161 CeedChkBackend(ierr); 1162 // Basis action 1163 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1164 switch (emode) { 1165 case CEED_EVAL_NONE: 1166 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1167 1168 bool isStrided; 1169 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr); 1170 ierr = CeedElemRestrictionGetElementSize(Erestrict, 
&elemsize); CeedChkBackend(ierr); 1171 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 1172 if (!isStrided) { 1173 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1174 CeedChkBackend(ierr); 1175 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1176 CeedInt compstride; 1177 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 1178 code << " // CompStride: "<<compstride<<"\n"; 1179 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 1180 data->indices.in[i] = restr_data->d_ind; 1181 code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1182 } else { 1183 bool backendstrides; 1184 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1185 CeedChkBackend(ierr); 1186 CeedInt nelem; 1187 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1188 CeedChkBackend(ierr); 1189 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1190 if (!backendstrides) { 1191 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1192 CeedChkBackend(ierr); 1193 } 1194 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1195 code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1196 } 1197 break; 1198 case CEED_EVAL_INTERP: 1199 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1200 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1201 code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1202 code << " }\n"; 1203 break; 1204 case CEED_EVAL_GRAD: 1205 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1206 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1207 break; 1208 case CEED_EVAL_WEIGHT: 1209 code << " CeedScalar 
r_q"<<i<<"[1];\n"; 1210 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1211 break; // No action 1212 case CEED_EVAL_DIV: 1213 break; // TODO: Not implemented 1214 case CEED_EVAL_CURL: 1215 break; // TODO: Not implemented 1216 } 1217 } 1218 code << "\n // -- Output fields --\n"; 1219 for (CeedInt i = 0; i < numoutputfields; i++) { 1220 code << " // ---- Output field "<<i<<" ----\n"; 1221 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1222 CeedChkBackend(ierr); 1223 // Basis action 1224 switch (emode) { 1225 case CEED_EVAL_NONE: 1226 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1227 break; // No action 1228 case CEED_EVAL_INTERP: 1229 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1230 break; 1231 case CEED_EVAL_GRAD: 1232 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1233 break; 1234 case CEED_EVAL_WEIGHT: 1235 break; // Should not occur 1236 case CEED_EVAL_DIV: 1237 break; // TODO: Not implemented 1238 case CEED_EVAL_CURL: 1239 break; // TODO: Not implemented 1240 } 1241 } 1242 } else { 1243 code << "\n // Note: No Collocated Gradient\n"; 1244 code << " // -- Input fields --\n"; 1245 for (CeedInt i = 0; i < numinputfields; i++) { 1246 code << " // ---- Input field "<<i<<" ----\n"; 1247 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1248 } 1249 code << " // -- Output fields --\n"; 1250 for (CeedInt i = 0; i < numoutputfields; i++) { 1251 code << " // ---- Output field "<<i<<" ----\n"; 1252 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1253 } 1254 } 1255 code << "\n // -- QFunction Inputs and outputs --\n"; 1256 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1257 for (CeedInt i = 0; i < numinputfields; i++) { 1258 code << " // ---- Input field "<<i<<" ----\n"; 1259 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1260 } 1261 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1262 for (CeedInt i = 0; i < numoutputfields; i++) { 1263 code << " // ---- Output field "<<i<<" ----\n"; 1264 code << " 
out["<<i<<"] = r_qq"<<i<<";\n"; 1265 } 1266 code << "\n // -- Apply QFunction --\n"; 1267 code << " "<<qFunctionName<<"(ctx, "; 1268 if (dim != 3 || useCollograd) { 1269 code << "1"; 1270 } else { 1271 code << "Q1d"; 1272 } 1273 code << ", in, out);\n"; 1274 if (useCollograd) { 1275 code << "\n // Note: Collocated Gradient\n"; 1276 code << " // -- Output fields --\n"; 1277 for (CeedInt i = 0; i < numoutputfields; i++) { 1278 code << " // ---- Output field "<<i<<" ----\n"; 1279 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1280 CeedChkBackend(ierr); 1281 // Basis action 1282 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1283 switch (emode) { 1284 case CEED_EVAL_NONE: 1285 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1286 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1287 code << " }\n"; 1288 break; // No action 1289 case CEED_EVAL_INTERP: 1290 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1291 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1292 code << " }\n"; 1293 break; 1294 case CEED_EVAL_GRAD: 1295 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1296 break; 1297 case CEED_EVAL_WEIGHT: 1298 break; // Should not occur 1299 case CEED_EVAL_DIV: 1300 break; // TODO: Not implemented 1301 case CEED_EVAL_CURL: 1302 break; // TODO: Not implemented 1303 } 1304 } 1305 code << " }\n"; 1306 } 1307 1308 // Output basis apply if needed 1309 // Generate the correct eval mode code for each output 1310 code << "\n // -- Output field basis action and restrictions --\n"; 1311 for (CeedInt i = 0; i < numoutputfields; i++) { 1312 code << " // ---- Output field "<<i<<" ----\n"; 1313 // Get elemsize, emode, ncomp 1314 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1315 CeedChkBackend(ierr); 1316 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1317 CeedChkBackend(ierr); 1318 ierr = 
CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1319 CeedChkBackend(ierr); 1320 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1321 CeedChkBackend(ierr); 1322 // Basis action 1323 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1324 switch (emode) { 1325 case CEED_EVAL_NONE: 1326 code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1327 break; // No action 1328 case CEED_EVAL_INTERP: 1329 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1330 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1331 break; 1332 case CEED_EVAL_GRAD: 1333 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1334 if (useCollograd) { 1335 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1336 } else { 1337 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1338 } 1339 break; 1340 // LCOV_EXCL_START 1341 case CEED_EVAL_WEIGHT: { 1342 Ceed ceed; 1343 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1344 return CeedError(ceed, CEED_ERROR_BACKEND, 1345 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1346 break; // Should not occur 1347 } 1348 case CEED_EVAL_DIV: 1349 break; // TODO: Not implemented 1350 case CEED_EVAL_CURL: 1351 break; // TODO: Not implemented 1352 // LCOV_EXCL_STOP 1353 } 1354 // Restriction 1355 bool isStrided; 1356 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 1357 if (!isStrided) { 1358 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1359 CeedChkBackend(ierr); 1360 code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 1361 CeedInt compstride; 1362 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 1363 code << " // CompStride: "<<compstride<<"\n"; 1364 ierr 
= CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 1365 data->indices.out[i] = restr_data->d_ind; 1366 code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1367 } else { 1368 bool backendstrides; 1369 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1370 CeedChkBackend(ierr); 1371 CeedInt nelem; 1372 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1373 CeedChkBackend(ierr); 1374 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1375 if (!backendstrides) { 1376 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1377 CeedChkBackend(ierr); 1378 } 1379 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1380 code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1381 } 1382 } 1383 1384 code << " }\n"; 1385 code << "}\n"; 1386 code << "// -----------------------------------------------------------------------------\n\n"; 1387 1388 // View kernel for debugging 1389 CeedDebug(ceed, code.str().c_str()); 1390 1391 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 1392 "T1d", CeedIntMax(Q1d, data->maxP1d)); 1393 CeedChkBackend(ierr); 1394 ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1395 CeedChkBackend(ierr); 1396 1397 ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 1398 return CEED_ERROR_SUCCESS; 1399 } 1400 //------------------------------------------------------------------------------ 1401