1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #define CEED_DEBUG_COLOR 12 9 10 #include <ceed/ceed.h> 11 #include <ceed/backend.h> 12 #include <cuda_runtime.h> 13 #include <iostream> 14 #include <sstream> 15 #include "ceed-cuda-gen.h" 16 #include "../cuda/ceed-cuda-compile.h" 17 #include "../cuda-ref/ceed-cuda-ref.h" 18 #include "../cuda-shared/ceed-cuda-shared.h" 19 20 static const char *atomicAdd = QUOTE( 21 //------------------------------------------------------------------------------ 22 // Atomic add, for older CUDA 23 //------------------------------------------------------------------------------ 24 __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) { 25 unsigned long long int *address_as_ull = (unsigned long long int *)address; 26 unsigned long long int old = *address_as_ull, assumed; 27 do { 28 assumed = old; 29 old = 30 atomicCAS(address_as_ull, assumed, 31 __double_as_longlong(val + 32 __longlong_as_double(assumed))); 33 // Note: uses integer comparison to avoid hang in case of NaN 34 // (since NaN != NaN) 35 } while (assumed != old); 36 return __longlong_as_double(old); 37 } 38 ); 39 40 static const char *deviceFunctions = QUOTE( 41 42 //------------------------------------------------------------------------------ 43 // Typedefs 44 //------------------------------------------------------------------------------ 45 typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 46 typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 47 48 typedef struct { 49 CeedInt tidx; 50 CeedInt tidy; 51 CeedInt tidz; 52 CeedInt tid; 53 CeedScalar* slice; 54 } BackendData; 55 56 //------------------------------------------------------------------------------ 57 // Load matrices for basis actions 58 //------------------------------------------------------------------------------ 59 template <int P, int Q> 60 inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 61 for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 62 B[i] = d_B[i]; 63 } 64 65 //------------------------------------------------------------------------------ 66 // 1D 67 //------------------------------------------------------------------------------ 68 69 //------------------------------------------------------------------------------ 70 // L-vector -> E-vector, offsets provided 71 //------------------------------------------------------------------------------ 72 template <int NCOMP, int COMPSTRIDE, int P1d> 73 inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 74 if (data.tidx < P1d) { 75 const CeedInt node = data.tidx; 76 const CeedInt ind = indices[node + elem * P1d]; 77 for (CeedInt comp = 0; comp < NCOMP; ++comp) 78 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 79 } 80 } 81 82 //------------------------------------------------------------------------------ 83 // L-vector -> E-vector, strided 84 //------------------------------------------------------------------------------ 85 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 86 inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 87 if (data.tidx < P1d) { 88 const CeedInt node = data.tidx; 89 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 90 for (CeedInt comp = 0; comp < NCOMP; ++comp) 91 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 92 } 93 } 94 95 //------------------------------------------------------------------------------ 96 // E-vector -> L-vector, offsets provided 97 //------------------------------------------------------------------------------ 98 template <int NCOMP, int COMPSTRIDE, int P1d> 99 inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 100 if (data.tidx < P1d) { 101 const CeedInt node = data.tidx; 102 const CeedInt ind = indices[node + elem * P1d]; 103 for (CeedInt comp = 0; comp < NCOMP; ++comp) 104 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 105 } 106 } 107 108 //------------------------------------------------------------------------------ 109 // E-vector -> L-vector, strided 110 //------------------------------------------------------------------------------ 111 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 112 inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 113 if (data.tidx < P1d) { 114 const CeedInt node = data.tidx; 115 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 116 for (CeedInt comp = 0; comp < NCOMP; ++comp) 117 d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 118 } 119 } 120 121 //------------------------------------------------------------------------------ 122 // 1D tensor contraction x 123 //------------------------------------------------------------------------------ 124 template <int NCOMP, int P1d, int Q1d> 125 inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 126 data.slice[data.tidx] = *U; 127 __syncthreads(); 128 *V = 0.0; 129 if (data.tidx < Q1d) 130 for (CeedInt i = 0; i < P1d; ++i) 131 *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 132 __syncthreads(); 133 } 134 135 //------------------------------------------------------------------------------ 136 // 1D transpose tensor contraction x 137 //------------------------------------------------------------------------------ 138 template <int NCOMP, int P1d, int Q1d> 139 inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 140 data.slice[data.tidx] = *U; 141 __syncthreads(); 142 *V = 0.0; 143 if (data.tidx < P1d) 144 for (CeedInt i = 0; i < Q1d; ++i) 145 *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 146 __syncthreads(); 147 } 148 149 //------------------------------------------------------------------------------ 150 // 1D interpolate to quadrature points 151 //------------------------------------------------------------------------------ 152 template <int NCOMP, int P1d, int Q1d> 153 inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 154 for (CeedInt comp = 0; comp < NCOMP; comp++) 155 ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 156 } 157 158 //------------------------------------------------------------------------------ 159 // 1D interpolate transpose 160 //------------------------------------------------------------------------------ 161 template <int NCOMP, int P1d, int Q1d> 162 inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 163 for (CeedInt comp=0; comp<NCOMP; comp++) 164 ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 165 } 166 167 //------------------------------------------------------------------------------ 168 // 1D derivatives at quadrature points 169 //------------------------------------------------------------------------------ 170 template <int NCOMP, int P1d, int Q1d> 171 inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 172 for (CeedInt comp = 0; comp < NCOMP; comp++) 173 ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 174 } 175 176 //------------------------------------------------------------------------------ 177 // 1D derivatives transpose 178 //------------------------------------------------------------------------------ 179 template <int NCOMP, int P1d, int Q1d> 180 inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 181 for (CeedInt comp = 0; comp < NCOMP; comp++) 182 ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 183 } 184 185 //------------------------------------------------------------------------------ 186 // 2D 187 //------------------------------------------------------------------------------ 188 189 //------------------------------------------------------------------------------ 190 // L-vector -> E-vector, offsets provided 191 //------------------------------------------------------------------------------ 192 template <int NCOMP, int COMPSTRIDE, int P1d> 193 inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 194 if (data.tidx < P1d && data.tidy < P1d) { 195 const CeedInt node = data.tidx + data.tidy*P1d; 196 const CeedInt ind = indices[node + elem * P1d*P1d]; 197 for (CeedInt comp = 0; comp < NCOMP; ++comp) 198 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 199 } 200 } 201 202 //------------------------------------------------------------------------------ 203 // L-vector -> E-vector, strided 204 //------------------------------------------------------------------------------ 205 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 206 inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 207 if (data.tidx < P1d && data.tidy < P1d) { 208 const CeedInt node = data.tidx + data.tidy*P1d; 209 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 210 for (CeedInt comp = 0; comp < NCOMP; ++comp) 211 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 212 } 213 } 214 215 //------------------------------------------------------------------------------ 216 // E-vector -> L-vector, offsets provided 217 //------------------------------------------------------------------------------ 218 template <int NCOMP, int COMPSTRIDE, int P1d> 219 inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 220 if (data.tidx < P1d && data.tidy < P1d) { 221 const CeedInt node = data.tidx + data.tidy*P1d; 222 const CeedInt ind = indices[node + elem * P1d*P1d]; 223 for (CeedInt comp = 0; comp < NCOMP; ++comp) 224 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 225 } 226 } 227 228 //------------------------------------------------------------------------------ 229 // E-vector -> L-vector, strided 230 //------------------------------------------------------------------------------ 231 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 232 inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 233 if (data.tidx < P1d && data.tidy < P1d) { 234 const CeedInt node = data.tidx + data.tidy*P1d; 235 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 236 for (CeedInt comp = 0; comp < NCOMP; ++comp) 237 d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 238 } 239 } 240 241 //------------------------------------------------------------------------------ 242 // 2D tensor contraction x 243 //------------------------------------------------------------------------------ 244 template <int NCOMP, int P1d, int Q1d> 245 inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 246 data.slice[data.tidx+data.tidy*T1d] = *U; 247 __syncthreads(); 248 *V = 0.0; 249 if (data.tidx < Q1d && data.tidy < P1d) 250 for (CeedInt i = 0; i < P1d; ++i) 251 *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 252 __syncthreads(); 253 } 254 255 //------------------------------------------------------------------------------ 256 // 2D tensor contract y 257 //------------------------------------------------------------------------------ 258 template <int NCOMP, int P1d, int Q1d> 259 inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 260 data.slice[data.tidx+data.tidy*T1d] = *U; 261 __syncthreads(); 262 *V = 0.0; 263 if (data.tidx < Q1d && data.tidy < Q1d) 264 for (CeedInt i = 0; i < P1d; ++i) 265 *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 266 __syncthreads(); 267 } 268 269 //------------------------------------------------------------------------------ 270 // 2D transpose tensor contract y 271 //------------------------------------------------------------------------------ 272 template <int NCOMP, int P1d, int Q1d> 273 inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 274 data.slice[data.tidx+data.tidy*T1d] = *U; 275 __syncthreads(); 276 *V = 0.0; 277 if (data.tidx < Q1d && data.tidy < P1d) 278 for (CeedInt i = 0; i < Q1d; ++i) 279 *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 280 __syncthreads(); 281 } 282 283 //------------------------------------------------------------------------------ 284 // 2D transpose tensor contract x 285 //------------------------------------------------------------------------------ 286 template <int NCOMP, int P1d, int Q1d> 287 inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 288 data.slice[data.tidx+data.tidy*T1d] = *U; 289 __syncthreads(); 290 *V = 0.0; 291 if (data.tidx < P1d && data.tidy < P1d) 292 for (CeedInt i = 0; i < Q1d; ++i) 293 *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 294 __syncthreads(); 295 } 296 297 //------------------------------------------------------------------------------ 298 // 2D transpose tensor contract and add x 299 //------------------------------------------------------------------------------ 300 template <int NCOMP, int P1d, int Q1d> 301 inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 302 data.slice[data.tidx+data.tidy*T1d] = *U; 303 __syncthreads(); 304 if (data.tidx < P1d && data.tidy < P1d) 305 for (CeedInt i = 0; i < Q1d; ++i) 306 *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 307 __syncthreads(); 308 } 309 310 //------------------------------------------------------------------------------ 311 // 2D interpolate to quadrature points 312 //------------------------------------------------------------------------------ 313 template <int NCOMP, int P1d, int Q1d> 314 inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 315 CeedScalar r_t[1]; 316 for (CeedInt comp = 0; comp < NCOMP; comp++) { 317 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 318 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 319 } 320 } 321 322 //------------------------------------------------------------------------------ 323 // 2D interpolate transpose 324 //------------------------------------------------------------------------------ 325 template <int NCOMP, int P1d, int Q1d> 326 inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 327 CeedScalar r_t[1]; 328 for (CeedInt comp = 0; comp < NCOMP; comp++) { 329 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 330 ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 331 } 332 } 333 334 //------------------------------------------------------------------------------ 335 // 2D derivatives at quadrature points 336 //------------------------------------------------------------------------------ 337 template <int NCOMP, int P1d, int Q1d> 338 inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 339 CeedScalar r_t[1]; 340 for (CeedInt comp = 0; comp < NCOMP; comp++) { 341 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 342 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 343 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 344 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 345 } 346 } 347 348 //------------------------------------------------------------------------------ 349 // 2D derivatives transpose 350 //------------------------------------------------------------------------------ 351 template <int NCOMP, int P1d, int Q1d> 352 inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 353 CeedScalar r_t[1]; 354 for (CeedInt comp = 0; comp < NCOMP; comp++) { 355 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 356 ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 357 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 358 ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 359 } 360 } 361 362 //------------------------------------------------------------------------------ 363 // 3D 364 //------------------------------------------------------------------------------ 365 366 //------------------------------------------------------------------------------ 367 // L-vector -> E-vector, offsets provided 368 //------------------------------------------------------------------------------ 369 template <int NCOMP, int COMPSTRIDE, int P1d> 370 inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 371 if (data.tidx < P1d && data.tidy < P1d) 372 for (CeedInt z = 0; z < P1d; ++z) { 373 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 374 const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 375 for (CeedInt comp = 0; comp < NCOMP; ++comp) 376 r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 377 } 378 } 379 380 //------------------------------------------------------------------------------ 381 // L-vector -> E-vector, strided 382 //------------------------------------------------------------------------------ 383 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 384 inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 385 if (data.tidx < P1d && data.tidy < P1d) 386 for (CeedInt z = 0; z < P1d; ++z) { 387 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 388 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 389 for (CeedInt comp = 0; comp < NCOMP; ++comp) 390 r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 391 } 392 } 393 394 //------------------------------------------------------------------------------ 395 // E-vector -> Q-vector, offests provided 396 //------------------------------------------------------------------------------ 397 template <int NCOMP, int COMPSTRIDE, int Q1d> 398 inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 399 if (data.tidx < Q1d && data.tidy < Q1d) { 400 const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 401 const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 402 for (CeedInt comp = 0; comp < NCOMP; ++comp) 403 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 404 } 405 } 406 407 //------------------------------------------------------------------------------ 408 // E-vector -> Q-vector, strided 409 //------------------------------------------------------------------------------ 410 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 411 inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 412 if (data.tidx < Q1d && data.tidy < Q1d) { 413 const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 414 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 415 for (CeedInt comp = 0; comp < NCOMP; ++comp) 416 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 417 } 418 } 419 420 //------------------------------------------------------------------------------ 421 // E-vector -> L-vector, offsets provided 422 //------------------------------------------------------------------------------ 423 template <int NCOMP, int COMPSTRIDE, int P1d> 424 inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 425 if (data.tidx < P1d && data.tidy < P1d) 426 for (CeedInt z = 0; z < P1d; ++z) { 427 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 428 const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 429 for (CeedInt comp = 0; comp < NCOMP; ++comp) 430 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 431 } 432 } 433 434 //------------------------------------------------------------------------------ 435 // E-vector -> L-vector, strided 436 //------------------------------------------------------------------------------ 437 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 438 inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 439 if (data.tidx < P1d && data.tidy < P1d) 440 for (CeedInt z = 0; z < P1d; ++z) { 441 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 442 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 443 for (CeedInt comp = 0; comp < NCOMP; ++comp) 444 d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 445 } 446 } 447 448 //------------------------------------------------------------------------------ 449 // 3D tensor contract x 450 //------------------------------------------------------------------------------ 451 template <int NCOMP, int P1d, int Q1d> 452 inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 453 CeedScalar r_B[P1d]; 454 for (CeedInt i = 0; i < P1d; ++i) 455 r_B[i] = B[i + data.tidx*P1d]; 456 457 for (CeedInt k = 0; k < P1d; ++k) { 458 data.slice[data.tidx+data.tidy*T1d] = U[k]; 459 __syncthreads(); 460 V[k] = 0.0; 461 if (data.tidx < Q1d && data.tidy < P1d) 462 for (CeedInt i = 0; i < P1d; ++i) 463 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 464 __syncthreads(); 465 } 466 } 467 468 //------------------------------------------------------------------------------ 469 // 3D tensor contract y 470 //------------------------------------------------------------------------------ 471 template <int NCOMP, int P1d, int Q1d> 472 inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 473 CeedScalar r_B[P1d]; 474 for (CeedInt i = 0; i < P1d; ++i) 475 r_B[i] = B[i + data.tidy*P1d]; 476 477 for (CeedInt k = 0; k < P1d; ++k) { 478 data.slice[data.tidx+data.tidy*T1d] = U[k]; 479 __syncthreads(); 480 V[k] = 0.0; 481 if (data.tidx < Q1d && data.tidy < Q1d) 482 for (CeedInt i = 0; i < P1d; ++i) 483 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 484 __syncthreads(); 485 } 486 } 487 488 //------------------------------------------------------------------------------ 489 // 3D tensor contract z 490 //------------------------------------------------------------------------------ 491 template <int NCOMP, int P1d, int Q1d> 492 inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 493 for (CeedInt k = 0; k < Q1d; ++k) { 494 V[k] = 0.0; 495 if (data.tidx < Q1d && data.tidy < Q1d) 496 for (CeedInt i = 0; i < P1d; ++i) 497 V[k] += B[i + k*P1d] * U[i]; // Contract z direction 498 } 499 } 500 501 //------------------------------------------------------------------------------ 502 // 3D transpose tensor contract z 503 //------------------------------------------------------------------------------ 504 template <int NCOMP, int P1d, int Q1d> 505 inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 506 for (CeedInt k = 0; k < P1d; ++k) { 507 V[k] = 0.0; 508 if (data.tidx < Q1d && data.tidy < Q1d) 509 for (CeedInt i = 0; i < Q1d; ++i) 510 V[k] += B[k + i*P1d] * U[i]; // Contract z direction 511 } 512 } 513 514 //------------------------------------------------------------------------------ 515 // 3D transpose tensor contract y 516 //------------------------------------------------------------------------------ 517 template <int NCOMP, int P1d, int Q1d> 518 inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 519 CeedScalar r_B[Q1d]; 520 for (CeedInt i = 0; i < Q1d; ++i) 521 r_B[i] = B[data.tidy + i*P1d]; 522 523 for (CeedInt k = 0; k < P1d; ++k) { 524 data.slice[data.tidx+data.tidy*T1d] = U[k]; 525 __syncthreads(); 526 V[k] = 0.0; 527 if (data.tidx < Q1d && data.tidy < P1d) 528 for (CeedInt i = 0; i < Q1d; ++i) 529 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 530 __syncthreads(); 531 } 532 } 533 534 //------------------------------------------------------------------------------ 535 // 3D transpose tensor contract add y 536 //------------------------------------------------------------------------------ 537 template <int NCOMP, int P1d, int Q1d> 538 inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 539 CeedScalar r_B[Q1d]; 540 for (CeedInt i = 0; i < Q1d; ++i) 541 r_B[i] = B[data.tidy + i*P1d]; 542 543 for (CeedInt k = 0; k < P1d; ++k) { 544 data.slice[data.tidx+data.tidy*T1d] = U[k]; 545 __syncthreads(); 546 if (data.tidx < Q1d && data.tidy < P1d) 547 for (CeedInt i = 0; i < Q1d; ++i) 548 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 549 __syncthreads(); 550 } 551 } 552 553 //------------------------------------------------------------------------------ 554 // 3D transpose tensor contract x 555 //------------------------------------------------------------------------------ 556 template <int NCOMP, int P1d, int Q1d> 557 inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 558 CeedScalar r_B[Q1d]; 559 for (CeedInt i = 0; i < Q1d; ++i) 560 r_B[i] = B[data.tidx + i*P1d]; 561 562 for (CeedInt k = 0; k < P1d; ++k) { 563 data.slice[data.tidx+data.tidy*T1d] = U[k]; 564 __syncthreads(); 565 V[k] = 0.0; 566 if (data.tidx < P1d && data.tidy < P1d) 567 for (CeedInt i = 0; i < Q1d; ++i) 568 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 569 __syncthreads(); 570 } 571 } 572 573 //------------------------------------------------------------------------------ 574 // 3D transpose tensor contract add x 575 //------------------------------------------------------------------------------ 576 template <int NCOMP, int P1d, int Q1d> 577 inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 578 CeedScalar r_B[Q1d]; 579 for (CeedInt i = 0; i < Q1d; ++i) 580 r_B[i] = B[data.tidx + i*P1d]; 581 582 for (CeedInt k = 0; k < P1d; ++k) { 583 data.slice[data.tidx+data.tidy*T1d] = U[k]; 584 __syncthreads(); 585 if (data.tidx < P1d && data.tidy < P1d) 586 for (CeedInt i = 0; i < Q1d; ++i) 587 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 588 __syncthreads(); 589 } 590 } 591 592 //------------------------------------------------------------------------------ 593 // 3D interpolate to quadrature points 594 //------------------------------------------------------------------------------ 595 template <int NCOMP, int P1d, int Q1d> 596 inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 597 CeedScalar r_t1[T1d]; 598 CeedScalar r_t2[T1d]; 599 for (CeedInt comp = 0; comp < NCOMP; comp++) { 600 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 601 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 602 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 603 } 604 } 605 606 //------------------------------------------------------------------------------ 607 // 3D interpolate transpose 608 //------------------------------------------------------------------------------ 609 template <int NCOMP, int P1d, int Q1d> 610 inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 611 CeedScalar r_t1[T1d]; 612 CeedScalar r_t2[T1d]; 613 for (CeedInt comp = 0; comp < NCOMP; comp++) { 614 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 615 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 616 ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 617 } 618 } 619 620 //------------------------------------------------------------------------------ 621 // 3D derivatives at quadrature points 622 //------------------------------------------------------------------------------ 623 template <int NCOMP, int P1d, int Q1d> 624 inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 625 CeedScalar r_t1[T1d]; 626 CeedScalar r_t2[T1d]; 627 for (CeedInt comp = 0; comp < NCOMP; comp++) { 628 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 629 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 630 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 631 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 632 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 633 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 634 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 635 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 636 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 637 } 638 } 639 640 //------------------------------------------------------------------------------ 641 // 3D derivatives transpose 642 //------------------------------------------------------------------------------ 643 template <int NCOMP, int P1d, int Q1d> 644 inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 645 CeedScalar r_t1[T1d]; 646 CeedScalar r_t2[T1d]; 647 for (CeedInt comp = 0; comp < NCOMP; comp++) { 648 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 649 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 650 ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 651 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 652 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 653 ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 654 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 655 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 656 ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 657 } 658 } 659 660 //------------------------------------------------------------------------------ 661 // 3D collocated derivatives computation 662 //------------------------------------------------------------------------------ 663 template <int NCOMP, int Q1d> 664 inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 665 if (data.tidx < Q1d && data.tidy < Q1d) { 666 for (CeedInt comp = 0; comp < NCOMP; ++comp) { 667 data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 668 __syncthreads(); 669 // X derivative 670 r_V[comp+0*NCOMP] = 0.0; 671 for (CeedInt i = 0; i < Q1d; ++i) 672 r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 673 // Y derivative 674 r_V[comp+1*NCOMP] = 0.0; 675 for (CeedInt i = 0; i < Q1d; ++i) 676 r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 677 // Z derivative 678 r_V[comp+2*NCOMP] = 0.0; 679 for (CeedInt i = 0; i < Q1d; ++i) 680 r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 681 __syncthreads(); 682 } 683 } 684 } 685 686 //------------------------------------------------------------------------------ 687 // 3D collocated derivatives transpose 688 //------------------------------------------------------------------------------ 689 template <int NCOMP, int Q1d> 690 inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 691 if (data.tidx < Q1d && data.tidy < Q1d) { 692 for (CeedInt comp = 0; comp < NCOMP; ++comp) { 693 // X derivative 694 data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 695 __syncthreads(); 696 for (CeedInt i = 0; i < Q1d; ++i) 697 r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 698 __syncthreads(); 699 // Y derivative 700 data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP]; 701 __syncthreads(); 702 for (CeedInt i = 0; i < Q1d; ++i) 703 r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 704 __syncthreads(); 705 // Z derivative 706 for (CeedInt i = 0; i < Q1d; ++i) 707 r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 708 } 709 } 710 } 711 712 //------------------------------------------------------------------------------ 713 // 1D quadrature weights 714 //------------------------------------------------------------------------------ 715 template <int Q1d> 716 inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 717 *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 718 } 719 720 //------------------------------------------------------------------------------ 721 // 2D quadrature weights 722 //------------------------------------------------------------------------------ 723 template <int Q1d> 724 inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 725 *w = (data.tidx < Q1d && data.tidy < Q1d) ? 726 qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 727 } 728 729 //------------------------------------------------------------------------------ 730 // 3D quadrature weights 731 //------------------------------------------------------------------------------ 732 template <int Q1d> 733 inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 734 const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 735 const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 736 for (CeedInt z = 0; z < Q1d; ++z) 737 w[z] = quad ? pw*qweight1d[z] : 0.0; 738 } 739 740 ); 741 //------------------------------------------------------------------------------ 742 // Build singe operator kernel 743 //------------------------------------------------------------------------------ 744 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 745 746 using std::ostringstream; 747 using std::string; 748 int ierr; 749 bool setupdone; 750 ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); 751 if (setupdone) return CEED_ERROR_SUCCESS; 752 Ceed ceed; 753 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 754 CeedOperator_Cuda_gen *data; 755 ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 756 CeedQFunction qf; 757 CeedQFunction_Cuda_gen *qf_data; 758 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 759 ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 760 CeedSize lsize; 761 CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields, 762 numoutputfields, ncomp, dim = 1; 763 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 764 Q1d = Q; 765 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 766 CeedOperatorField *opinputfields, *opoutputfields; 767 ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields); 768 CeedChkBackend(ierr); 769 CeedQFunctionField *qfinputfields, *qfoutputfields; 770 ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 771 CeedChkBackend(ierr); 772 CeedEvalMode emode; 773 CeedBasis basis; 774 CeedBasis_Cuda_shared *basis_data; 775 CeedElemRestriction Erestrict; 776 CeedElemRestriction_Cuda *restr_data; 777 778 // Check for restriction only identity operator 779 bool is_identity_qf; 780 ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 781 if (is_identity_qf) { 782 CeedEvalMode emodein, emodeout; 783 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein); CeedChkBackend(ierr); 784 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout); CeedChkBackend(ierr); 785 if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE) 786 // LCOV_EXCL_START 787 return CeedError(ceed, CEED_ERROR_BACKEND, 788 "Backend does not implement restriction only identity operators"); 789 // LCOV_EXCL_STOP 790 } 791 792 ostringstream code; 793 string devFunctions(deviceFunctions); 794 795 // Add atomicAdd function for old NVidia architectures 796 struct cudaDeviceProp prop; 797 Ceed_Cuda *ceed_data; 798 ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr); 799 ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr); 800 if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){ 801 code << atomicAdd; 802 } 803 804 code << devFunctions; 805 806 string qFunction(qf_data->qFunctionSource); 807 string qFunctionName(qf_data->qFunctionName); 808 string oper; 809 oper = "CeedKernel_Cuda_gen_" + qFunctionName; 810 811 // Find dim and Q1d 812 bool useCollograd = false; 813 // Only use collocated gradient algorithm when we actually compute a gradient. 814 if ( dim == 3 ) { 815 for (CeedInt i = 0; i < numinputfields; i++) { 816 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 817 if (emode == CEED_EVAL_GRAD) { 818 useCollograd = true; 819 } 820 } 821 for (CeedInt i = 0; i < numoutputfields; i++) { 822 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 823 if (emode == CEED_EVAL_GRAD) { 824 useCollograd = true; 825 } 826 } 827 } 828 data->maxP1d = 0; 829 for (CeedInt i = 0; i < numinputfields; i++) { 830 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 831 if (basis != CEED_BASIS_COLLOCATED) { 832 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 833 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 834 CeedChkBackend(ierr); 835 836 // Check for collocated gradient 837 useCollograd = useCollograd && basis_data->d_collo_grad_1d; 838 839 // Collect dim and Q1d 840 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 841 bool isTensor; 842 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 843 if (isTensor) { 844 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 845 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 846 if (P1d>data->maxP1d) data->maxP1d = P1d; 847 } else { 848 // LCOV_EXCL_START 849 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 850 // LCOV_EXCL_STOP 851 } 852 } 853 } 854 // Check output bases for Q1d, dim as well 855 // The only input basis might be CEED_BASIS_COLLOCATED 856 for (CeedInt i = 0; i < numoutputfields; i++) { 857 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 858 859 if (basis != CEED_BASIS_COLLOCATED) { 860 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 861 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 862 CeedChkBackend(ierr); 863 864 // Collect dim and Q1d 865 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 866 bool isTensor; 867 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 868 if (isTensor) { 869 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 870 } else { 871 // LCOV_EXCL_START 872 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 873 // LCOV_EXCL_STOP 874 } 875 876 // Check for collocated gradient 877 useCollograd = useCollograd && basis_data->d_collo_grad_1d; 878 } 879 } 880 data->dim = dim; 881 data->Q1d = Q1d; 882 883 // Define CEED_Q_VLA 884 if (dim != 3 || useCollograd) { 885 code << "\n#define CEED_Q_VLA 1\n\n"; 886 } else { 887 code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 888 } 889 890 code << qFunction; 891 892 // Setup 893 code << "\n// -----------------------------------------------------------------------------\n"; 894 code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 895 for (CeedInt i = 0; i < numinputfields; i++) { 896 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 897 CeedChkBackend(ierr); 898 if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 899 code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 900 } 901 } 902 903 for (CeedInt i = 0; i < numoutputfields; i++) { 904 code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 905 } 906 907 code << " const CeedInt Dim = "<<dim<<";\n"; 908 code << " const CeedInt Q1d = "<<Q1d<<";\n"; 909 910 code << " extern __shared__ CeedScalar slice[];\n"; 911 code << " BackendData data;\n"; 912 code << " data.tidx = threadIdx.x;\n"; 913 code << " data.tidy = threadIdx.y;\n"; 914 code << " data.tidz = threadIdx.z;\n"; 915 code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 916 code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 917 918 code << "\n // -- Input field constants and basis data --\n"; 919 //Initialize constants, and matrices B and G 920 for (CeedInt i = 0; i < numinputfields; i++) { 921 code << " // ---- Input field "<<i<<" ----\n"; 922 // Get elemsize, emode, ncomp 923 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 924 CeedChkBackend(ierr); 925 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 926 CeedChkBackend(ierr); 927 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 928 CeedChkBackend(ierr); 929 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 930 CeedChkBackend(ierr); 931 932 // Set field constants 933 if (emode != CEED_EVAL_WEIGHT) { 934 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 935 if (basis != CEED_BASIS_COLLOCATED) { 936 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 937 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 938 } else { 939 code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 940 } 941 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 942 } 943 944 // Load basis data 945 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 946 switch (emode) { 947 case CEED_EVAL_NONE: 948 break; 949 case CEED_EVAL_INTERP: 950 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 951 data->B.in[i] = basis_data->d_interp_1d; 952 code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 953 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 954 break; 955 case CEED_EVAL_GRAD: 956 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 957 data->B.in[i] = basis_data->d_interp_1d; 958 code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 959 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 960 if (useCollograd) { 961 data->G.in[i] = basis_data->d_collo_grad_1d; 962 code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 963 code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 964 } else { 965 data->G.in[i] = basis_data->d_grad_1d; 966 code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 967 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 968 } 969 break; 970 case CEED_EVAL_WEIGHT: 971 break; // No action 972 case CEED_EVAL_DIV: 973 break; // TODO: Not implemented 974 case CEED_EVAL_CURL: 975 break; // TODO: Not implemented 976 } 977 } 978 979 code << "\n // -- Output field constants and basis data --\n"; 980 for (CeedInt i = 0; i < numoutputfields; i++) { 981 code << " // ---- Output field "<<i<<" ----\n"; 982 // Get elemsize, emode, ncomp 983 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 984 CeedChkBackend(ierr); 985 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 986 CeedChkBackend(ierr); 987 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 988 CeedChkBackend(ierr); 989 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 990 CeedChkBackend(ierr); 991 992 // Set field constants 993 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 994 if (basis != CEED_BASIS_COLLOCATED) { 995 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 996 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 997 } else { 998 code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 999 } 1000 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 1001 1002 // Load basis data 1003 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1004 switch (emode) { 1005 case CEED_EVAL_NONE: 1006 break; // No action 1007 case CEED_EVAL_INTERP: 1008 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1009 data->B.out[i] = basis_data->d_interp_1d; 1010 code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1011 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1012 break; 1013 case CEED_EVAL_GRAD: 1014 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1015 data->B.out[i] = basis_data->d_interp_1d; 1016 code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1017 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1018 if (useCollograd) { 1019 data->G.out[i] = basis_data->d_collo_grad_1d; 1020 code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 1021 code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1022 } else { 1023 data->G.out[i] = basis_data->d_grad_1d; 1024 code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1025 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1026 } 1027 break; 1028 // LCOV_EXCL_START 1029 case CEED_EVAL_WEIGHT: { 1030 Ceed ceed; 1031 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1032 return CeedError(ceed, CEED_ERROR_BACKEND, 1033 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1034 break; // Should not occur 1035 } 1036 case CEED_EVAL_DIV: 1037 break; // TODO: Not implemented 1038 case CEED_EVAL_CURL: 1039 break; // TODO: Not implemented 1040 // LCOV_EXCL_STOP 1041 } 1042 } 1043 code << "\n // -- Element loop --\n"; 1044 code << " __syncthreads();\n"; 1045 code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1046 // Input basis apply if needed 1047 // Generate the correct eval mode code for each input 1048 code << " // -- Input field restrictions and basis actions --\n"; 1049 for (CeedInt i = 0; i < numinputfields; i++) { 1050 code << " // ---- Input field "<<i<<" ----\n"; 1051 // Get elemsize, emode, ncomp 1052 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1053 CeedChkBackend(ierr); 1054 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1055 CeedChkBackend(ierr); 1056 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1057 CeedChkBackend(ierr); 1058 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1059 CeedChkBackend(ierr); 1060 1061 // Restriction 1062 if (emode != CEED_EVAL_WEIGHT && 1063 !((emode == CEED_EVAL_NONE) && useCollograd)) { 1064 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1065 1066 bool isStrided; 1067 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 1068 if (!isStrided) { 1069 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1070 CeedChkBackend(ierr); 1071 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1072 CeedInt compstride; 1073 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 1074 code << " // CompStride: "<<compstride<<"\n"; 1075 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 1076 data->indices.in[i] = restr_data->d_ind; 1077 code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1078 } else { 1079 bool backendstrides; 1080 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1081 CeedChkBackend(ierr); 1082 CeedInt nelem; 1083 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1084 CeedChkBackend(ierr); 1085 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1086 if (!backendstrides) { 1087 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1088 CeedChkBackend(ierr); 1089 } 1090 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1091 code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1092 } 1093 } 1094 1095 // Basis action 1096 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1097 switch (emode) { 1098 case CEED_EVAL_NONE: 1099 if (!useCollograd) { 1100 code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1101 } 1102 break; 1103 case CEED_EVAL_INTERP: 1104 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1105 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1106 break; 1107 case CEED_EVAL_GRAD: 1108 if (useCollograd) { 1109 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1110 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1111 } else { 1112 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1113 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1114 } 1115 break; 1116 case CEED_EVAL_WEIGHT: 1117 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1118 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 1119 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1120 data->W = basis_data->d_q_weight_1d; 1121 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1122 break; // No action 1123 case CEED_EVAL_DIV: 1124 break; // TODO: Not implemented 1125 case CEED_EVAL_CURL: 1126 break; // TODO: Not implemented 1127 } 1128 } 1129 1130 // Q function 1131 code << "\n // -- Output field setup --\n"; 1132 for (CeedInt i = 0; i < numoutputfields; i++) { 1133 code << "\n // ---- Output field "<<i<<" ----\n"; 1134 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1135 CeedChkBackend(ierr); 1136 if (emode==CEED_EVAL_GRAD) 1137 { 1138 if (useCollograd) { 1139 //Accumulator for gradient slices 1140 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1141 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1142 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1143 code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1144 code << " }\n"; 1145 code << " }\n"; 1146 } else { 1147 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1148 } 1149 } 1150 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1151 { 1152 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1153 } 1154 } 1155 // We treat quadrature points per slice in 3d to save registers 1156 if (useCollograd) { 1157 code << "\n // Note: Collocated Gradient\n"; 1158 code << "#pragma unroll\n"; 1159 code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1160 code << " // -- Input fields --\n"; 1161 for (CeedInt i = 0; i < numinputfields; i++) { 1162 code << " // ---- Input field "<<i<<" ----\n"; 1163 // Get elemsize, emode, ncomp 1164 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1165 CeedChkBackend(ierr); 1166 // Basis action 1167 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1168 switch (emode) { 1169 case CEED_EVAL_NONE: 1170 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1171 1172 bool isStrided; 1173 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr); 1174 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 1175 if (!isStrided) { 1176 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1177 CeedChkBackend(ierr); 1178 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1179 CeedInt compstride; 1180 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 1181 code << " // CompStride: "<<compstride<<"\n"; 1182 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 1183 data->indices.in[i] = restr_data->d_ind; 1184 code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1185 } else { 1186 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr); 1187 bool backendstrides; 1188 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1189 CeedChkBackend(ierr); 1190 CeedInt nelem; 1191 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1192 CeedChkBackend(ierr); 1193 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1194 if (!backendstrides) { 1195 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1196 CeedChkBackend(ierr); 1197 } 1198 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1199 code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1200 } 1201 break; 1202 case CEED_EVAL_INTERP: 1203 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1204 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1205 code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1206 code << " }\n"; 1207 break; 1208 case CEED_EVAL_GRAD: 1209 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1210 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1211 break; 1212 case CEED_EVAL_WEIGHT: 1213 code << " CeedScalar r_q"<<i<<"[1];\n"; 1214 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1215 break; // No action 1216 case CEED_EVAL_DIV: 1217 break; // TODO: Not implemented 1218 case CEED_EVAL_CURL: 1219 break; // TODO: Not implemented 1220 } 1221 } 1222 code << "\n // -- Output fields --\n"; 1223 for (CeedInt i = 0; i < numoutputfields; i++) { 1224 code << " // ---- Output field "<<i<<" ----\n"; 1225 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1226 CeedChkBackend(ierr); 1227 // Basis action 1228 switch (emode) { 1229 case CEED_EVAL_NONE: 1230 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1231 break; // No action 1232 case CEED_EVAL_INTERP: 1233 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1234 break; 1235 case CEED_EVAL_GRAD: 1236 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1237 break; 1238 case CEED_EVAL_WEIGHT: 1239 break; // Should not occur 1240 case CEED_EVAL_DIV: 1241 break; // TODO: Not implemented 1242 case CEED_EVAL_CURL: 1243 break; // TODO: Not implemented 1244 } 1245 } 1246 } else { 1247 code << "\n // Note: No Collocated Gradient\n"; 1248 code << " // -- Input fields --\n"; 1249 for (CeedInt i = 0; i < numinputfields; i++) { 1250 code << " // ---- Input field "<<i<<" ----\n"; 1251 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1252 } 1253 code << " // -- Output fields --\n"; 1254 for (CeedInt i = 0; i < numoutputfields; i++) { 1255 code << " // ---- Output field "<<i<<" ----\n"; 1256 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1257 } 1258 } 1259 code << "\n // -- QFunction Inputs and outputs --\n"; 1260 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1261 for (CeedInt i = 0; i < numinputfields; i++) { 1262 code << " // ---- Input field "<<i<<" ----\n"; 1263 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1264 } 1265 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1266 for (CeedInt i = 0; i < numoutputfields; i++) { 1267 code << " // ---- Output field "<<i<<" ----\n"; 1268 code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1269 } 1270 code << "\n // -- Apply QFunction --\n"; 1271 code << " "<<qFunctionName<<"(ctx, "; 1272 if (dim != 3 || useCollograd) { 1273 code << "1"; 1274 } else { 1275 code << "Q1d"; 1276 } 1277 code << ", in, out);\n"; 1278 if (useCollograd) { 1279 code << "\n // Note: Collocated Gradient\n"; 1280 code << " // -- Output fields --\n"; 1281 for (CeedInt i = 0; i < numoutputfields; i++) { 1282 code << " // ---- Output field "<<i<<" ----\n"; 1283 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1284 CeedChkBackend(ierr); 1285 // Basis action 1286 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1287 switch (emode) { 1288 case CEED_EVAL_NONE: 1289 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1290 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1291 code << " }\n"; 1292 break; // No action 1293 case CEED_EVAL_INTERP: 1294 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1295 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1296 code << " }\n"; 1297 break; 1298 case CEED_EVAL_GRAD: 1299 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1300 break; 1301 case CEED_EVAL_WEIGHT: 1302 break; // Should not occur 1303 case CEED_EVAL_DIV: 1304 break; // TODO: Not implemented 1305 case CEED_EVAL_CURL: 1306 break; // TODO: Not implemented 1307 } 1308 } 1309 code << " }\n"; 1310 } 1311 1312 // Output basis apply if needed 1313 // Generate the correct eval mode code for each output 1314 code << "\n // -- Output field basis action and restrictions --\n"; 1315 for (CeedInt i = 0; i < numoutputfields; i++) { 1316 code << " // ---- Output field "<<i<<" ----\n"; 1317 // Get elemsize, emode, ncomp 1318 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1319 CeedChkBackend(ierr); 1320 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1321 CeedChkBackend(ierr); 1322 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1323 CeedChkBackend(ierr); 1324 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1325 CeedChkBackend(ierr); 1326 // Basis action 1327 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1328 switch (emode) { 1329 case CEED_EVAL_NONE: 1330 code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1331 break; // No action 1332 case CEED_EVAL_INTERP: 1333 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1334 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1335 break; 1336 case CEED_EVAL_GRAD: 1337 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1338 if (useCollograd) { 1339 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1340 } else { 1341 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1342 } 1343 break; 1344 // LCOV_EXCL_START 1345 case CEED_EVAL_WEIGHT: { 1346 Ceed ceed; 1347 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1348 return CeedError(ceed, CEED_ERROR_BACKEND, 1349 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1350 break; // Should not occur 1351 } 1352 case CEED_EVAL_DIV: 1353 break; // TODO: Not implemented 1354 case CEED_EVAL_CURL: 1355 break; // TODO: Not implemented 1356 // LCOV_EXCL_STOP 1357 } 1358 // Restriction 1359 bool isStrided; 1360 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 1361 if (!isStrided) { 1362 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1363 CeedChkBackend(ierr); 1364 code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 1365 CeedInt compstride; 1366 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 1367 code << " // CompStride: "<<compstride<<"\n"; 1368 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 1369 data->indices.out[i] = restr_data->d_ind; 1370 code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1371 } else { 1372 bool backendstrides; 1373 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1374 CeedChkBackend(ierr); 1375 CeedInt nelem; 1376 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1377 CeedChkBackend(ierr); 1378 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1379 if (!backendstrides) { 1380 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1381 CeedChkBackend(ierr); 1382 } 1383 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1384 code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1385 } 1386 } 1387 1388 code << " }\n"; 1389 code << "}\n"; 1390 code << "// -----------------------------------------------------------------------------\n\n"; 1391 1392 // View kernel for debugging 1393 CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 1394 CeedDebug(ceed, code.str().c_str()); 1395 1396 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 1397 "T1d", CeedIntMax(Q1d, data->maxP1d)); 1398 CeedChkBackend(ierr); 1399 ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1400 CeedChkBackend(ierr); 1401 1402 ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 1403 return CEED_ERROR_SUCCESS; 1404 } 1405 //------------------------------------------------------------------------------ 1406