1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#define CEED_DEBUG_COLOR 12

#include <ceed/ceed.h>
#include <ceed/backend.h>
#include <iostream>
#include <string>
#include <sstream>
#include "ceed-hip-gen.h"
#include "../hip/ceed-hip.h"
#include "../hip-shared/ceed-hip-shared.h"
#include "../hip/ceed-hip-compile.h"

static const char *deviceFunctions = QUOTE(

//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;

// Per-thread state handed to every device helper: the thread's coordinates
// within the block plus a pointer to the block's shared-memory scratch slice.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperatively stage the P x Q basis matrix d_B into B; all threads of the
// block copy strided subsets of the P*Q entries.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  const CeedInt blockSize = blockDim.x*blockDim.y*blockDim.z;
  for (CeedInt k = data.tid; k < P*Q; k += blockSize)
    B[k] = d_B[k];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one element's nodal values for every component via an index array.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind  = indices[node + elem*P1d];
    for (CeedInt comp = 0; comp < NCOMP; comp++)
      r_u[comp] = d_u[ind + COMPSTRIDE*comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather one element's nodal values for every component using the
// compile-time node/component/element strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind  = node*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; comp++)
      r_u[comp] = d_u[ind + comp*STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add one element's nodal values; atomicAdd guards against
// concurrent updates to L-vector entries reached through the index array.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind  = indices[node + elem*P1d];
    for (CeedInt comp = 0; comp < NCOMP; comp++)
      atomicAdd(&d_v[ind + COMPSTRIDE*comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate one element's nodal values using the compile-time strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind  = node*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; comp++)
      d_v[ind + comp*STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// Stage U in shared memory, then each of the first Q1d threads dots its row
// of B with the staged values.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d) {
    for (CeedInt j = 0; j < P1d; j++)
      *V += B[j + data.tidx*P1d] * data.slice[j]; // Contract x direction
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
// Stage U in shared memory, then each of the first P1d threads dots its
// column of B with the staged values.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d) {
    for (CeedInt j = 0; j < Q1d; j++)
      *V += B[data.tidx + j*P1d] * data.slice[j]; // Contract x direction
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate every component with the basis matrix c_B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Apply the transpose of the basis matrix c_B to every component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// Differentiate every component with the gradient matrix c_G (c_B is unused
// in the 1D case but kept for a uniform interface).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Apply the transposed gradient matrix c_G to every component (c_B unused).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one element's nodal values for every component via an index array;
// each thread handles the node (tidx, tidy).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind  = indices[node + elem*P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; comp++)
      r_u[comp] = d_u[ind + COMPSTRIDE*comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather one element's nodal values for every component using the
// compile-time node/component/element strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind  = node*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; comp++)
      r_u[comp] = d_u[ind + comp*STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add one element's nodal values; atomicAdd guards against
// concurrent updates to L-vector entries reached through the index array.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind  = indices[node + elem*P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; comp++)
      atomicAdd(&d_v[ind + COMPSTRIDE*comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate one element's nodal values using the compile-time strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind  = node*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; comp++)
      d_v[ind + comp*STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Contract the x index with B; values are exchanged through a T1d-wide
// shared-memory tile.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx + data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d) {
    for (CeedInt j = 0; j < P1d; j++)
      *V += B[j + data.tidx*P1d] * data.slice[j + data.tidy*T1d]; // Contract x direction
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Contract the y index with B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx + data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt j = 0; j < P1d; j++)
      *V += B[j + data.tidy*P1d] * data.slice[data.tidx + j*T1d]; // Contract y direction
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Contract the y index with the transpose of B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx + data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d) {
    for (CeedInt j = 0; j < Q1d; j++)
      *V += B[data.tidy + j*P1d] * data.slice[data.tidx + j*T1d]; // Contract y direction
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Contract the x index with the transpose of B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx + data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt j = 0; j < Q1d; j++)
      *V += B[data.tidx + j*P1d] * data.slice[j + data.tidy*T1d]; // Contract x direction
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractXTranspose2d, but accumulates into V instead of
// overwriting it (no zero-initialization of *V).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx + data.tidy*T1d] = *U;
  __syncthreads();
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt j = 0; j < Q1d; j++)
      *V += B[data.tidx + j*P1d] * data.slice[j + data.tidy*T1d]; // Contract x direction
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate every component: contract x, then y, with the basis matrix c_B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract y, then x, with the transposed basis.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Compute d/dx and d/dy for every component; the x-derivatives occupy the
// first NCOMP slots of r_V and the y-derivatives the next NCOMP slots.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // d/dx: differentiate in x, interpolate in y
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
    // d/dy: interpolate in x, differentiate in y
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of grad2d; the second pass uses the Add variant so both
// derivative contributions are summed into r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one element's nodal values via an index array; each thread owns the
// column of P1d nodes above (tidx, tidy), stored z-fastest in r_u.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather one element's nodal values using the compile-time strides
// (z-fastest layout in r_u).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Read one z-slice (level q) of quadrature-point values via an index array.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Read one z-slice (level q) of quadrature-point values using the strides.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add one element's nodal values (z-fastest layout in r_v);
// atomicAdd guards L-vector entries reached through the index array.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind  = indices[node + elem*P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; comp++)
        atomicAdd(&d_v[ind + COMPSTRIDE*comp], r_v[z + comp*P1d]);
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate one element's nodal values using the compile-time strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind  = node*STRIDES_NODE + elem*STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; comp++)
        d_v[ind + comp*STRIDES_COMP] += r_v[z + comp*P1d];
    }
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Contract the x index with B, one z-level at a time; each thread first
// caches its row of B in registers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt j = 0; j < P1d; j++)
    r_B[j] = B[j + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; k++) {
    data.slice[data.tidx + data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d) {
      for (CeedInt j = 0; j < P1d; j++)
        V[k] += r_B[j] * data.slice[j + data.tidy*T1d]; // Contract x direction
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// Contract the y index with B, one z-level at a time.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt j = 0; j < P1d; j++)
    r_B[j] = B[j + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; k++) {
    data.slice[data.tidx + data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d) {
      for (CeedInt j = 0; j < P1d; j++)
        V[k] += r_B[j] * data.slice[data.tidx + j*T1d]; // Contract y direction
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// Contract the z index with B; operates entirely on per-thread registers,
// so no shared memory or synchronization is needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; k++) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d) {
      for (CeedInt j = 0; j < P1d; j++)
        V[k] += B[j + k*P1d] * U[j]; // Contract z direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Contract the z index with the transpose of B; register-only.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; k++) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d) {
      for (CeedInt j = 0; j < Q1d; j++)
        V[k] += B[k + j*P1d] * U[j]; // Contract z direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Contract the y index with the transpose of B, one z-level at a time.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt j = 0; j < Q1d; j++)
    r_B[j] = B[data.tidy + j*P1d];

  for (CeedInt k = 0; k < P1d; k++) {
    data.slice[data.tidx + data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d) {
      for (CeedInt j = 0; j < Q1d; j++)
        V[k] += r_B[j] * data.slice[data.tidx + j*T1d]; // Contract y direction
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3d, but accumulates into V instead of
// overwriting it (no zero-initialization of V[k]).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt j = 0; j < Q1d; j++)
    r_B[j] = B[data.tidy + j*P1d];

  for (CeedInt k = 0; k < P1d; k++) {
    data.slice[data.tidx + data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d) {
      for (CeedInt j = 0; j < Q1d; j++)
        V[k] += r_B[j] * data.slice[data.tidx + j*T1d]; // Contract y direction
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Contract the x index with the transpose of B, one z-level at a time.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt j = 0; j < Q1d; j++)
    r_B[j] = B[data.tidx + j*P1d];

  for (CeedInt k = 0; k < P1d; k++) {
    data.slice[data.tidx + data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d) {
      for (CeedInt j = 0; j < Q1d; j++)
        V[k] += r_B[j] * data.slice[j + data.tidy*T1d]; // Contract x direction
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX3d, but accumulates into V instead of
// overwriting it.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt j = 0; j < Q1d; j++)
    r_B[j] = B[data.tidx + j*P1d];

  for (CeedInt k = 0; k < P1d; k++) {
    data.slice[data.tidx + data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d) {
      for (CeedInt j = 0; j < Q1d; j++)
        V[k] += r_B[j] * data.slice[j + data.tidy*T1d]; // Contract x direction
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate every component: contract x, y, then z with c_B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract z, y, then x with the transposed basis.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Compute d/dx, d/dy, d/dz for every component; derivative d occupies
// r_V[comp*Q1d + d*NCOMP*Q1d ...].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // d/dx: differentiate in x, interpolate in y and z
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    // d/dy: differentiate in y only
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    // d/dz: differentiate in z only
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}
//------------------------------------------------------------------------------ 630 // 3D derivatives transpose 631 //------------------------------------------------------------------------------ 632 template <int NCOMP, int P1d, int Q1d> 633 inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 634 CeedScalar r_t1[T1d]; 635 CeedScalar r_t2[T1d]; 636 for (CeedInt comp = 0; comp < NCOMP; comp++) { 637 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 638 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 639 ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 640 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 641 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 642 ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 643 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 644 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 645 ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 646 } 647 } 648 649 //------------------------------------------------------------------------------ 650 // 3D collocated derivatives computation 651 //------------------------------------------------------------------------------ 652 template <int NCOMP, int Q1d> 653 inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 654 if (data.tidx < Q1d && data.tidy < Q1d) { 655 for (CeedInt comp = 0; comp < NCOMP; ++comp) { 656 data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 657 __syncthreads(); 658 // X derivative 659 r_V[comp+0*NCOMP] = 0.0; 660 for (CeedInt i = 0; i < Q1d; ++i) 661 r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + 
data.tidy*T1d]; // Contract x direction (X derivative) 662 // Y derivative 663 r_V[comp+1*NCOMP] = 0.0; 664 for (CeedInt i = 0; i < Q1d; ++i) 665 r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 666 // Z derivative 667 r_V[comp+2*NCOMP] = 0.0; 668 for (CeedInt i = 0; i < Q1d; ++i) 669 r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 670 __syncthreads(); 671 } 672 } 673 } 674 675 //------------------------------------------------------------------------------ 676 // 3D collocated derivatives transpose 677 //------------------------------------------------------------------------------ 678 template <int NCOMP, int Q1d> 679 inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 680 if (data.tidx < Q1d && data.tidy < Q1d) { 681 for (CeedInt comp = 0; comp < NCOMP; ++comp) { 682 // X derivative 683 data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 684 __syncthreads(); 685 for (CeedInt i = 0; i < Q1d; ++i) 686 r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 687 __syncthreads(); 688 // Y derivative 689 data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP]; 690 __syncthreads(); 691 for (CeedInt i = 0; i < Q1d; ++i) 692 r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 693 __syncthreads(); 694 // Z derivative 695 for (CeedInt i = 0; i < Q1d; ++i) 696 r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 697 } 698 } 699 } 700 701 //------------------------------------------------------------------------------ 702 // 1D quadrature weights 703 //------------------------------------------------------------------------------ 704 template <int Q1d> 705 inline 
__device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 706 *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 707 } 708 709 //------------------------------------------------------------------------------ 710 // 2D quadrature weights 711 //------------------------------------------------------------------------------ 712 template <int Q1d> 713 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 714 *w = (data.tidx < Q1d && data.tidy < Q1d) ? 715 qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 716 } 717 718 //------------------------------------------------------------------------------ 719 // 3D quadrature weights 720 //------------------------------------------------------------------------------ 721 template <int Q1d> 722 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 723 const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 724 const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 725 for (CeedInt z = 0; z < Q1d; ++z) 726 w[z] = quad ? 
pw*qweight1d[z] : 0.0;
}

);
//------------------------------------------------------------------------------
// Build single operator kernel
//
// Generates, compiles, and caches a fused HIP kernel for the whole operator:
// the deviceFunctions string above is concatenated with the user QFunction
// source and generated per-field restriction/basis/QFunction glue, then
// compiled via CeedCompileHip. Device pointers for basis matrices,
// restriction indices, and quadrature weights are captured into `data`
// (B/G/indices/W) in the same pass, matching the kernel argument layout.
// Idempotent: returns immediately if setup is already done.
//------------------------------------------------------------------------------
CEED_INTERN int CeedHipGenOperatorBuild(CeedOperator op) {

  using std::ostringstream;
  using std::string;
  int ierr;
  bool setupdone;
  ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
  if (setupdone) return CEED_ERROR_SUCCESS;
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
  CeedOperator_Hip_gen *data;
  ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
  CeedQFunction qf;
  CeedQFunction_Hip_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
  ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
  CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
          numoutputfields, ncomp, dim = 0, lsize;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
  ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
  CeedChkBackend(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
  CeedChkBackend(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
  CeedChkBackend(ierr);
  CeedEvalMode emode;
  CeedBasis basis;
  CeedBasis_Hip_shared *basis_data;
  CeedElemRestriction Erestrict;
  CeedElemRestriction_Hip *restr_data;

  // Check for restriction only identity operator (not supported: there would
  // be no basis action or QFunction work for the fused kernel to do)
  bool is_identity_qf;
  ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
  if (is_identity_qf) {
    CeedEvalMode emodein, emodeout;
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein); CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout); CeedChkBackend(ierr);
    if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
      // LCOV_EXCL_START
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "Backend does not implement restriction only identity operators");
      // LCOV_EXCL_STOP
  }

  // Start the kernel source with the shared device helper functions
  ostringstream code;
  string devFunctions(deviceFunctions);
  code << devFunctions;

  string qFunction(qf_data->qFunctionSource);
  string qFunctionName(qf_data->qFunctionName);
  string oper;
  oper = "CeedKernel_Hip_gen_" + qFunctionName;

  // Macros so the user QFunction source compiles inside this kernel
  code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
  code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n";
  code << "#define CeedPragmaSIMD\n";
  code << "#define CEED_ERROR_SUCCESS 0\n\n";

  // Find dim and Q1d; collocated gradient is usable only if every active
  // basis provides d_collograd1d
  bool useCollograd = true;
  data->maxP1d = 0;
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
      CeedChkBackend(ierr);

      // Check for collocated gradient
      useCollograd = useCollograd && basis_data->d_collograd1d;

      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
        ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
        if (P1d>data->maxP1d) data->maxP1d = P1d;
      } else {
        // LCOV_EXCL_START
        return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
        // LCOV_EXCL_STOP
      }
    }
  }
  // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
  for (CeedInt i = 0; i < numoutputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);

    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChkBackend(ierr);

      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
      } else {
        // LCOV_EXCL_START
        return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
        // LCOV_EXCL_STOP
      }

      // Check for collocated gradient
      useCollograd = useCollograd && basis_data->d_collograd1d;
    }
  }
  data->dim = dim;
  data->Q1d = Q1d;

  // Define CEED_Q_VLA: in the 3D collocated path the QFunction is applied
  // one quadrature point at a time (Q = 1); otherwise per z-column of Q1d
  if (dim != 3 || useCollograd) {
    code << "\n#define CEED_Q_VLA 1\n\n";
  } else {
    code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
  }

  // Inline the user QFunction source
  code << qFunction;

  // Setup: kernel signature and per-field global pointers
  code << "\n// -----------------------------------------------------------------------------\n";
  code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
      code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
    }
  }

  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
  }

  code << " const CeedInt Dim = "<<dim<<";\n";
  code << " const CeedInt Q1d = "<<Q1d<<";\n";

  // Per-thread indexing helpers and this thread-z's slice of shared memory
  code << " HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
  code << " BackendData data;\n";
  code << " data.tidx = threadIdx.x;\n";
  code << " data.tidy = threadIdx.y;\n";
  code << " data.tidz = threadIdx.z;\n";
  code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
  code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";

  code << "\n // -- Input field constants and basis data --\n";
  // Initialize constants, and matrices B and G
  // NOTE(review): the generated shared arrays below are declared `double`
  // while loadMatrix copies CeedScalar — equivalent only when CeedScalar is
  // double; confirm before enabling a single-precision CeedScalar build.
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);

    // Set field constants
    if (emode != CEED_EVAL_WEIGHT) {
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
      if (basis != CEED_BASIS_COLLOCATED) {
        ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
        code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
      } else {
        code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
      }
      code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
    }

    // Load basis data; device pointers are captured into data->B/G here so
    // they line up with the B.in[i]/G.in[i] kernel arguments at launch
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.in[i] = basis_data->d_interp1d;
      code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.in[i] = basis_data->d_interp1d;
      code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      if (useCollograd) {
        data->G.in[i] = basis_data->d_collograd1d;
        code << " __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
        code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      } else {
        data->G.in[i] = basis_data->d_grad1d;
        code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
        code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  code << "\n // -- Output field constants and basis data --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);

    // Set field constants
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
      code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
    } else {
      code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
    }
    code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";

    // Load basis data
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      break; // No action
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.out[i] = basis_data->d_interp1d;
      code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.out[i] = basis_data->d_interp1d;
      code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      if (useCollograd) {
        data->G.out[i] = basis_data->d_collograd1d;
        code << " __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
        code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
      } else {
        data->G.out[i] = basis_data->d_grad1d;
        code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
        code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    // LCOV_EXCL_STOP
    }
  }
  // Element loop: one element per thread-z slab, grid-strided over nelem
  code << "\n // -- Element loop --\n";
  code << " __syncthreads();\n";
  code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
  // Input basis apply if needed
  // Generate the correct eval mode code for each input
  code << " // -- Input field restrictions and basis actions --\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);

    // Restriction: gather this element's dofs into registers (skipped for
    // weights, and for NONE inputs in the collocated path which are read
    // per-quadrature-slice instead)
    if (emode != CEED_EVAL_WEIGHT &&
        !((emode == CEED_EVAL_NONE) && useCollograd)) {
      code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";

      bool isStrided;
      ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
      if (!isStrided) {
        ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
        CeedChkBackend(ierr);
        code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
        CeedInt compstride;
        ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
        code << " // CompStride: "<<compstride<<"\n";
        ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
        data->indices.in[i] = restr_data->d_ind;
        code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
      } else {
        bool backendstrides;
        ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
        CeedChkBackend(ierr);
        CeedInt nelem;
        ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
        CeedChkBackend(ierr);
        CeedInt strides[3] = {1, elemsize*nelem, elemsize};
        if (!backendstrides) {
          ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
          CeedChkBackend(ierr);
        }
        code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
        code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
      }
    }

    // Basis action
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      if (!useCollograd) {
        code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
      }
      break;
    case CEED_EVAL_INTERP:
      code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
      code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      if (useCollograd) {
        // Collocated: interpolate now, differentiate per quad point later
        code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
        code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
      } else {
        code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
        code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      code << " CeedScalar r_t"<<i<<"[Q1d];\n";
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->W = basis_data->d_qweight1d;
      code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  // Q function: declare output accumulators/buffers before applying it
  code << "\n // -- Output field setup --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "\n // ---- Output field "<<i<<" ----\n";
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode==CEED_EVAL_GRAD)
    {
      if (useCollograd) {
        // Accumulator for gradient slices
        code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
        code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
        code << " for (CeedInt j = 0; j < Q1d; ++j) {\n";
        code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
        code << " }\n";
        code << " }\n";
      } else {
        code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
      }
    }
    if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
    {
      code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
    }
  }
  // We treat quadrature points per slice in 3d to save registers
  if (useCollograd) {
    code << "\n // Note: Collocated Gradient\n";
    code << "#pragma unroll\n";
    code << " for (CeedInt q=0; q<Q1d; q++) {\n";
    code << " // -- Input fields --\n";
    for (CeedInt i = 0; i < numinputfields; i++) {
      code << " // ---- Input field "<<i<<" ----\n";
      // Get elemsize, emode, ncomp
      ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
      CeedChkBackend(ierr);
      // Basis action: stage this z-slice's values into r_q<i>
      code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
      switch (emode) {
      case CEED_EVAL_NONE:
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";

        bool isStrided;
        ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
        ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
        ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
        if (!isStrided) {
          ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
          CeedChkBackend(ierr);
          code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
          CeedInt compstride;
          ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
          code << " // CompStride: "<<compstride<<"\n";
          ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
          data->indices.in[i] = restr_data->d_ind;
          code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
        } else {
          bool backendstrides;
          ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
          CeedChkBackend(ierr);
          CeedInt nelem;
          ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
          CeedChkBackend(ierr);
          CeedInt strides[3] = {1, elemsize*nelem, elemsize};
          if (!backendstrides) {
            ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
            CeedChkBackend(ierr);
          }
          code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
          code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
        }
        break;
      case CEED_EVAL_INTERP:
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
        code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
        code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
        code << " }\n";
        break;
      case CEED_EVAL_GRAD:
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
        code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        code << " CeedScalar r_q"<<i<<"[1];\n";
        code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
        break; // No action
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    code << "\n // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChkBackend(ierr);
      // Basis action: per-quadrature-point output buffers
      switch (emode) {
      case CEED_EVAL_NONE:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
        break;
      case CEED_EVAL_GRAD:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
  } else {
    // Non-collocated: QFunction consumes/produces whole z-columns directly
    code << "\n // Note: No Collocated Gradient\n";
    code << " // -- Input fields --\n";
    for (CeedInt i = 0; i < numinputfields; i++) {
      code << " // ---- Input field "<<i<<" ----\n";
      code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
    }
    code << " // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
    }
  }
  code << "\n // -- QFunction Inputs and outputs --\n";
  code << " CeedScalar* in["<<numinputfields<<"];\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    code << " in["<<i<<"] = r_q"<<i<<";\n";
  }
  code << " CeedScalar* out["<<numoutputfields<<"];\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    code << " out["<<i<<"] = r_qq"<<i<<";\n";
  }
  code << "\n // -- Apply QFunction --\n";
  code << " "<<qFunctionName<<"(ctx, ";
  if (dim != 3 || useCollograd) {
    code << "1";
  } else {
    code << "Q1d";
  }
  code << ", in, out);\n";
  if (useCollograd) {
    // Scatter per-quad-point results back into the z-column accumulators
    code << "\n // Note: Collocated Gradient\n";
    code << " // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChkBackend(ierr);
      // Basis action
      code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
      switch (emode) {
      case CEED_EVAL_NONE:
        code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << " }\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << " }\n";
        break;
      case CEED_EVAL_GRAD:
        code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    code << " }\n";
  }

  // Output basis apply if needed
  // Generate the correct eval mode code for each output
  code << "\n // -- Output field basis action and restrictions --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);
    // Basis action (transpose: quadrature values back to nodal values)
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
      break; // No action
    case CEED_EVAL_INTERP:
      code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      if (useCollograd) {
        // Gradient already applied per slice; only interpolation remains
        code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      } else {
        code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    // LCOV_EXCL_STOP
    }
    // Restriction: scatter/add nodal values back to the L-vector
    bool isStrided;
    ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
    if (!isStrided) {
      ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
      CeedChkBackend(ierr);
      code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
      CeedInt compstride;
      ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
      code << " // CompStride: "<<compstride<<"\n";
      ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
      data->indices.out[i] = restr_data->d_ind;
      code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
    } else {
      bool backendstrides;
      ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
      CeedChkBackend(ierr);
      CeedInt nelem;
      ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
      CeedChkBackend(ierr);
      CeedInt strides[3] = {1, elemsize*nelem, elemsize};
      if (!backendstrides) {
        ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
        CeedChkBackend(ierr);
      }
      code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
      code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
    }
  }

  code << " }\n";
  code << "}\n";
  code << "// -----------------------------------------------------------------------------\n\n";

  // View kernel for debugging
  CeedDebug(code.str().c_str());

  // Compile with T1d = max(Q1d, maxP1d): the per-axis thread/scratch extent
  ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 1,
                        "T1d", CeedIntMax(Q1d, data->maxP1d));
  CeedChkBackend(ierr);
  ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
  CeedChkBackend(ierr);

  ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
//------------------------------------------------------------------------------