// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
// All Rights reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.

#define CEED_DEBUG_COLOR 12

#include <ceed/ceed.h>
#include <ceed/backend.h>
#include <iostream>
#include <string>
#include <sstream>
#include "ceed-hip-gen.h"
#include "../hip-ref/ceed-hip-ref.h"
#include "../hip-shared/ceed-hip-shared.h"
#include "../hip/ceed-hip-compile.h"


//------------------------------------------------------------------------------
// Calculate the block size used for launching the operator kernel
//
// Picks a HIP thread-block shape {x, y, z}: x (and y for dim >= 2) covers the
// 1D thread extent max(P1d, Q1d), and z is the number of elements processed
// per block.  The elements-per-block values are heuristics that shrink as the
// basis grows so the total block size stays within hardware limits.
//
//   dim         - topological dimension of the basis (1, 2, or 3)
//   nelem       - total number of elements (not used by the current heuristic)
//   P1d         - number of 1D basis nodes
//   Q1d         - number of 1D quadrature points
//   block_sizes - [out] the 3 block dimensions {x, y, z}
//------------------------------------------------------------------------------
extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt nelem,
                                          const CeedInt P1d, const CeedInt Q1d,
                                          CeedInt *block_sizes) {

  // Threads per direction must cover both nodes and quadrature points
  const CeedInt thread1d = CeedIntMax(Q1d, P1d);
  if (dim==1) {
    // Aim for 64 elements per block, capped so x*z stays at or below 256 threads
    CeedInt elemsPerBlock = 64*thread1d > 256? 256/thread1d : 64;
    // Guard against a zero quotient when thread1d itself exceeds 256
    elemsPerBlock = elemsPerBlock>0?elemsPerBlock:1;
    block_sizes[0] = thread1d;
    block_sizes[1] = 1;
    block_sizes[2] = elemsPerBlock;
  } else if (dim==2) {
    // 2D blocks are thread1d x thread1d; pack more elements for tiny bases
    const CeedInt elemsPerBlock = thread1d<4? 16 : 2;
    block_sizes[0] = thread1d;
    block_sizes[1] = thread1d;
    block_sizes[2] = elemsPerBlock;
  } else if (dim==3) {
    // 3D blocks are the largest; step elements-per-block down as thread1d grows
    const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1);
    block_sizes[0] = thread1d;
    block_sizes[1] = thread1d;
    block_sizes[2] = elemsPerBlock;
  }
  return CEED_ERROR_SUCCESS;
}

// Device-side building blocks below are captured as a source string (via the
// QUOTE macro) and compiled at runtime together with the generated operator
// kernel.  NOTE(review): T1d, used as the slice stride throughout, is a
// compile-time constant injected by the kernel builder (presumably the 1D
// thread extent max(P1d, Q1d)) -- confirm against the code generator.
static const char *deviceFunctions = QUOTE(

//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
// Pointers to the operator's input/output field arrays (up to 16 each)
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;

// Per-thread kernel state: thread indices within the block plus this block's
// scratch "slice" buffer used to exchange values between threads
// (the __syncthreads() usage below implies slice is block-shared memory,
// set up by the generated kernel -- not visible in this file).
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperatively copy the P x Q basis matrix d_B into B; all threads of the
// block participate, striding by the total block size.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one element's dofs into registers r_u (one node per thread in x),
// indirecting through the restriction indices; components are COMPSTRIDE apart.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather one element's dofs using compile-time node/component/element strides
// (no index indirection).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add element values back to the L-vector; atomicAdd because nodes may
// be shared between elements processed concurrently.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided scatter: plain += (no atomics) -- strided layouts give each element
// node a distinct destination for this launch.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// V = B * U: stage each thread's value in the slice buffer, sync, then each of
// the Q1d output threads contracts over the P1d inputs.  The trailing sync
// protects the slice buffer from the next reuse.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
// V = B^T * U: same staging pattern, but P1d output threads contract over the
// Q1d inputs with B accessed transposed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
// Apply the interpolation matrix c_B to every component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// In 1D the gradient is a single contraction with c_G; c_B is accepted for a
// uniform call signature but unused here.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// 2D gather: one node per (tidx, tidy) thread, indirected through the
// restriction indices; components are COMPSTRIDE apart in the L-vector.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// 2D gather with compile-time node/component/element strides (no indirection).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// 2D scatter-add; atomicAdd because elements may share nodes.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// 2D strided scatter; plain += (destinations are distinct for this launch).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Contract the x index of a T1d x T1d slice with B; each thread stages its own
// value, syncs, then the (Q1d, P1d) output threads reduce along x.  The
// trailing sync protects the slice buffer from the next reuse.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Contract the y index; output threads span (Q1d, Q1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Contract y with B^T (B indexed transposed); output threads span (Q1d, P1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Contract x with B^T; overwrites V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractXTranspose2d but accumulates into V (no zeroing), so the
// caller's existing partial sums are preserved.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Tensor-product interpolation: contract x then y with c_B, per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract y then x with B^T, per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Gradient: (G_x B_y) then (B_x G_y); output layout is NCOMP x-derivatives
// followed by NCOMP y-derivatives.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: the x-derivative part overwrites r_V, the y-derivative
// part accumulates into it via the Add variant.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// 3D gather: each (tidx, tidy) thread loops over the z-layers; registers hold
// a z-column per component (r_u[z + comp*P1d]).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// 3D gather with compile-time strides (no indirection).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Read one z-slice (layer q) of quadrature-point data via indices.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; // NOTE(review): stray extra ';'
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Read one z-slice (layer q) of quadrature-point data with compile-time strides.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// 3D scatter-add over all z-layers; atomicAdd because nodes may be shared.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// 3D strided scatter; plain += (destinations are distinct for this launch).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
    }
}

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Contract x for every z-layer k: this thread's row of B is cached in
// registers, each layer is staged through the slice buffer between syncs.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// Contract y for every z-layer k; B row selected by tidy.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// Contract z entirely in registers (the z-column lives in this thread), so no
// shared-memory staging or synchronization is needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Transpose contract z, in registers (no synchronization needed).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contract y for every z-layer: the transposed B column for this
// thread's tidy is cached in registers, each layer staged via the slice buffer
// between syncs.  Overwrites V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3d but accumulates into V (no zeroing): V must
// already hold valid partial sums on entry.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose contract x for every z-layer; overwrites V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX3d but accumulates into V (no zeroing).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Tensor-product interpolation: contract x, y, then z with c_B per component.
// r_t1/r_t2 are register scratch columns sized by the thread extent T1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract z, y, then x with B^T per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Gradient: (G B B), (B G B), (B B G); output layout is NCOMP*Q1d values per
// derivative direction, x block first, then y, then z.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: the x-derivative block overwrites r_V, the y and z
// blocks accumulate into it via the Add variant of the final contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Derivatives at layer q when quadrature points are collocated with the basis:
// x and y via the slice buffer, z directly from the register column.
// NOTE(review): the __syncthreads() calls sit inside the (tidx,tidy) < Q1d
// guard; that is only safe if every thread of the block satisfies the guard
// (i.e. the block extent equals Q1d for these kernels) -- confirm against the
// kernel builder's launch configuration.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
      __syncthreads();
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      // Z derivative
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
      __syncthreads();
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose of gradCollo3d; accumulates into r_V.  The z part only adds layer
// q's contribution -- the full z contraction completes once all q layers have
// been processed by the caller.  See the divergence note on gradCollo3d.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // X derivative
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      __syncthreads();
      // Y derivative
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      __syncthreads();
      // Z derivative
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
    }
  }
}

//------------------------------------------------------------------------------ 731 // 1D quadrature weights 732 //------------------------------------------------------------------------------ 733 template <int Q1d> 734 inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 735 *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 736 } 737 738 //------------------------------------------------------------------------------ 739 // 2D quadrature weights 740 //------------------------------------------------------------------------------ 741 template <int Q1d> 742 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 743 *w = (data.tidx < Q1d && data.tidy < Q1d) ? 744 qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 745 } 746 747 //------------------------------------------------------------------------------ 748 // 3D quadrature weights 749 //------------------------------------------------------------------------------ 750 template <int Q1d> 751 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 752 const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 753 const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 754 for (CeedInt z = 0; z < Q1d; ++z) 755 w[z] = quad ? 
pw*qweight1d[z] : 0.0;
}

); // end of QUOTE(...) — terminates the deviceFunctions source string
//------------------------------------------------------------------------------
// Build single operator kernel
//------------------------------------------------------------------------------
// Generates the source of a single fused __global__ kernel that applies this
// CeedOperator (restriction gather, basis action, QFunction, basis transpose,
// restriction scatter), compiles it with hiprtc, and caches the module and
// kernel handle in the operator's backend data.  No-op if setup is already
// done.  Returns CEED_ERROR_SUCCESS or a backend error.
extern "C" int CeedHipGenOperatorBuild(CeedOperator op) {

  using std::ostringstream;
  using std::string;
  int ierr;
  bool setupdone;
  ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
  if (setupdone) return CEED_ERROR_SUCCESS;
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
  CeedOperator_Hip_gen *data;
  ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
  CeedQFunction qf;
  CeedQFunction_Hip_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
  ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
  CeedSize lsize;
  CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
          numoutputfields, ncomp, dim = 0;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
  CeedChkBackend(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
  CeedChkBackend(ierr);
  CeedEvalMode emode;
  CeedBasis basis;
  CeedBasis_Hip_shared *basis_data;
  CeedElemRestriction Erestrict;
  CeedElemRestriction_Hip *restr_data;

  // Check for restriction only identity operator
  bool is_identity_qf;
  ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
  if (is_identity_qf) {
    CeedEvalMode emodein, emodeout;
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein); CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout); CeedChkBackend(ierr);
    if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
      // LCOV_EXCL_START
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "Backend does not implement restriction only identity operators");
      // LCOV_EXCL_STOP
  }

  // Kernel source is accumulated into 'code', starting with the shared
  // device-function preamble (deviceFunctions string defined above).
  ostringstream code;
  string devFunctions(deviceFunctions);
  code << devFunctions;

  string qFunction(qf_data->qFunctionSource);
  string qFunctionName(qf_data->qFunctionName);
  string oper;
  oper = "CeedKernel_Hip_gen_" + qFunctionName;

  // Macros the user QFunction source relies on
  code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
  code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n";
  code << "#define CeedPragmaSIMD\n";
  code << "#define CEED_ERROR_SUCCESS 0\n\n";

  // Find dim and Q1d
  // The collocated-gradient path is only taken when every active basis
  // provides d_collo_grad_1d (checked over inputs here and outputs below).
  bool useCollograd = true;
  data->maxP1d = 0;
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
      CeedChkBackend(ierr);

      // Check for collocated gradient
      useCollograd = useCollograd && basis_data->d_collo_grad_1d;

      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
        ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
        // Track the largest P1d for the thread-block size computation below
        if (P1d>data->maxP1d) data->maxP1d = P1d;
      } else {
        // LCOV_EXCL_START
        return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
        // LCOV_EXCL_STOP
      }
    }
  }
  // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
  for (CeedInt i = 0; i < numoutputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);

    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChkBackend(ierr);

      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
      } else {
        // LCOV_EXCL_START
        return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
        // LCOV_EXCL_STOP
      }

      // Check for collocated gradient
      useCollograd = useCollograd && basis_data->d_collo_grad_1d;
    }
  }
  data->dim = dim;
  data->Q1d = Q1d;

  // Define CEED_Q_VLA
  // In the non-collocated 3D path the QFunction sees all Q1d points of a
  // column at once; otherwise one point per call.
  if (dim != 3 || useCollograd) {
    code << "\n#define CEED_Q_VLA 1\n\n";
  } else {
    code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
  }

  // Splice in the user QFunction source
  code << qFunction;

  // Setup
  code << "\n// -----------------------------------------------------------------------------\n";
  code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
  code << "__global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
      code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
    }
  }

  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
  }

  code << " const CeedInt Dim = "<<dim<<";\n";
  code << " const CeedInt Q1d = "<<Q1d<<";\n";

  // Per-thread bookkeeping and the dynamic shared-memory slice for this
  // thread's element (tidz indexes elements within the block)
  code << " HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
  code << " BackendData data;\n";
  code << " data.tidx = threadIdx.x;\n";
  code << " data.tidy = threadIdx.y;\n";
  code << " data.tidz = threadIdx.z;\n";
  code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
  code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";

  code << "\n // -- Input field constants and basis data --\n";
  //Initialize constants, and matrices B and G
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);

    // Set field constants
    if (emode != CEED_EVAL_WEIGHT) {
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
      if (basis != CEED_BASIS_COLLOCATED) {
        ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
        code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
      } else {
        // Collocated basis: nodes coincide with quadrature points
        code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
      }
      code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
    }

    // Load basis data
    // Device pointers are cached in data->B/G for the launch; the kernel
    // stages them in __shared__ via loadMatrix.
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.in[i] = basis_data->d_interp_1d;
      code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.in[i] = basis_data->d_interp_1d;
      code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      if (useCollograd) {
        // Collocated gradient acts at quadrature points: Q1d x Q1d matrix
        data->G.in[i] = basis_data->d_collo_grad_1d;
        code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
        code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      } else {
        data->G.in[i] = basis_data->d_grad_1d;
        code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
        code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  code << "\n // -- Output field constants and basis data --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);

    // Set field constants
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
      code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
    } else {
      code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
    }
    code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";

    // Load basis data
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      break; // No action
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.out[i] = basis_data->d_interp_1d;
      code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.out[i] = basis_data->d_interp_1d;
      code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      if (useCollograd) {
        data->G.out[i] = basis_data->d_collo_grad_1d;
        code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
        code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
      } else {
        data->G.out[i] = basis_data->d_grad_1d;
        code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
        code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    // LCOV_EXCL_STOP
    }
  }
  // Grid-stride-style loop over elements: blockDim.z elements per block
  code << "\n // -- Element loop --\n";
  code << " __syncthreads();\n";
  code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
  // Input basis apply if needed
  // Generate the correct eval mode code for each input
  code << " // -- Input field restrictions and basis actions --\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);

    // Restriction
    // NONE inputs under collograd are read slice-by-slice inside the q loop
    // instead of being gathered up front.
    if (emode != CEED_EVAL_WEIGHT &&
        !((emode == CEED_EVAL_NONE) && useCollograd)) {
      code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";

      bool isStrided;
      ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
      if (!isStrided) {
        ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
        CeedChkBackend(ierr);
        code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
        CeedInt compstride;
        ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
        code << " // CompStride: "<<compstride<<"\n";
        ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
        data->indices.in[i] = restr_data->d_ind;
        code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
      } else {
        bool backendstrides;
        ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
        CeedChkBackend(ierr);
        CeedInt nelem;
        ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
        CeedChkBackend(ierr);
        // Default backend strides; overwritten below when user-provided
        CeedInt strides[3] = {1, elemsize*nelem, elemsize};
        if (!backendstrides) {
          ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
          CeedChkBackend(ierr);
        }
        code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
        code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
      }
    }

    // Basis action
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      if (!useCollograd) {
        // No basis action: quadrature data aliases the gathered dofs
        code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
      }
      break;
    case CEED_EVAL_INTERP:
      code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
      code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      if (useCollograd) {
        // Interpolate to quadrature points; derivatives are taken per-slice
        // later with the collocated gradient matrix.
        code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
        code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
      } else {
        code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
        code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      code << " CeedScalar r_t"<<i<<"[Q1d];\n";
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->W = basis_data->d_q_weight_1d;
      code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  // Q function
  code << "\n // -- Output field setup --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "\n // ---- Output field "<<i<<" ----\n";
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode==CEED_EVAL_GRAD)
    {
      if (useCollograd) {
        //Accumulator for gradient slices
        code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
        code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
        code << " for (CeedInt j = 0; j < Q1d; ++j) {\n";
        code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
        code << " }\n";
        code << " }\n";
      } else {
        code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
      }
    }
    if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
    {
      code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
    }
  }
  // We treat quadrature points per slice in 3d to save registers
  if (useCollograd) {
    code << "\n // Note: Collocated Gradient\n";
    code << "#pragma unroll\n";
    code << " for (CeedInt q=0; q<Q1d; q++) {\n";
    code << " // -- Input fields --\n";
    for (CeedInt i = 0; i < numinputfields; i++) {
      code << " // ---- Input field "<<i<<" ----\n";
      // Get elemsize, emode, ncomp
      ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
      CeedChkBackend(ierr);
      // Basis action
      code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
      switch (emode) {
      case CEED_EVAL_NONE:
        // NONE inputs were skipped in the gather above: read the q-th slice
        // directly from the L-vector here.
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";

        bool isStrided;
        ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
        ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
        ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
        if (!isStrided) {
          ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
          CeedChkBackend(ierr);
          code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
          CeedInt compstride;
          ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
          code << " // CompStride: "<<compstride<<"\n";
          ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
          data->indices.in[i] = restr_data->d_ind;
          code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
        } else {
          bool backendstrides;
          ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
          CeedChkBackend(ierr);
          CeedInt nelem;
          ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
          CeedChkBackend(ierr);
          CeedInt strides[3] = {1, elemsize*nelem, elemsize};
          if (!backendstrides) {
            ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
            CeedChkBackend(ierr);
          }
          code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
          code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
        }
        break;
      case CEED_EVAL_INTERP:
        // Copy the q-th quadrature slice into the per-point buffer
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
        code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
        code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
        code << " }\n";
        break;
      case CEED_EVAL_GRAD:
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
        code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        code << " CeedScalar r_q"<<i<<"[1];\n";
        code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
        break; // No action
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    code << "\n // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChkBackend(ierr);
      // Basis action
      switch (emode) {
      case CEED_EVAL_NONE:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
        break;
      case CEED_EVAL_GRAD:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
  } else {
    // Without collograd the QFunction consumes whole columns; the per-point
    // buffers simply alias the basis-action results.
    code << "\n // Note: No Collocated Gradient\n";
    code << " // -- Input fields --\n";
    for (CeedInt i = 0; i < numinputfields; i++) {
      code << " // ---- Input field "<<i<<" ----\n";
      code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
    }
    code << " // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
    }
  }
  code << "\n // -- QFunction Inputs and outputs --\n";
  code << " CeedScalar* in["<<numinputfields<<"];\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    code << " in["<<i<<"] = r_q"<<i<<";\n";
  }
  code << " CeedScalar* out["<<numoutputfields<<"];\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    code << " out["<<i<<"] = r_qq"<<i<<";\n";
  }
  code << "\n // -- Apply QFunction --\n";
  code << " "<<qFunctionName<<"(ctx, ";
  if (dim != 3 || useCollograd) {
    code << "1";
  } else {
    code << "Q1d";
  }
  code << ", in, out);\n";
  if (useCollograd) {
    // Fold each per-point output slice back into the column accumulators
    code << "\n // Note: Collocated Gradient\n";
    code << " // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChkBackend(ierr);
      // Basis action
      code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
      switch (emode) {
      case CEED_EVAL_NONE:
        code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << " }\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << " }\n";
        break;
      case CEED_EVAL_GRAD:
        code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    code << " }\n";
  }

  // Output basis apply if needed
  // Generate the correct eval mode code for each output
  code << "\n // -- Output field basis action and restrictions --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);
    // Basis action
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
      break; // No action
    case CEED_EVAL_INTERP:
      code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      if (useCollograd) {
        // Derivatives were already folded in per-slice; only interpolation
        // transpose remains.
        code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      } else {
        code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    // LCOV_EXCL_STOP
    }
    // Restriction
    bool isStrided;
    ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
    if (!isStrided) {
      ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
      CeedChkBackend(ierr);
      code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
      CeedInt compstride;
      ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
      code << " // CompStride: "<<compstride<<"\n";
      ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
      data->indices.out[i] = restr_data->d_ind;
      code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
    } else {
      bool backendstrides;
      ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
      CeedChkBackend(ierr);
      CeedInt nelem;
      ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
      CeedChkBackend(ierr);
      CeedInt strides[3] = {1, elemsize*nelem, elemsize};
      if (!backendstrides) {
        ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
        CeedChkBackend(ierr);
      }
      code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
      code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
    }
  }

  code << " }\n";
  code << "}\n";
  code << "// -----------------------------------------------------------------------------\n\n";

  // View kernel for debugging
  CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
  CeedDebug(ceed, code.str().c_str());

  // Compile with hiprtc; T1d (1D thread extent) and BLOCK_SIZE are injected
  // as compile-time definitions from the computed launch geometry.
  CeedInt block_sizes[3];
  ierr = BlockGridCalculate_Hip_gen(dim, numelements, data->maxP1d, Q1d, block_sizes);
  CeedChkBackend(ierr);
  ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2,
                        "T1d", block_sizes[0],
                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
  CeedChkBackend(ierr);

  ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
//------------------------------------------------------------------------------