1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #define CEED_DEBUG_COLOR 12 9 10 #include <ceed/ceed.h> 11 #include <ceed/backend.h> 12 #include <iostream> 13 #include <string> 14 #include <sstream> 15 #include "ceed-hip-gen.h" 16 #include "../hip-ref/ceed-hip-ref.h" 17 #include "../hip-shared/ceed-hip-shared.h" 18 #include "../hip/ceed-hip-compile.h" 19 20 21 //------------------------------------------------------------------------------ 22 // Calculate the block size used for launching the operator kernel 23 //------------------------------------------------------------------------------ 24 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt nelem, 25 const CeedInt P1d, const CeedInt Q1d, 26 CeedInt *block_sizes) { 27 28 const CeedInt thread1d = CeedIntMax(Q1d, P1d); 29 if (dim==1) { 30 CeedInt elemsPerBlock = 64*thread1d > 256? 256/thread1d : 64; 31 elemsPerBlock = elemsPerBlock>0?elemsPerBlock:1; 32 block_sizes[0] = thread1d; 33 block_sizes[1] = 1; 34 block_sizes[2] = elemsPerBlock; 35 } else if (dim==2) { 36 const CeedInt elemsPerBlock = thread1d<4? 16 : 2; 37 block_sizes[0] = thread1d; 38 block_sizes[1] = thread1d; 39 block_sizes[2] = elemsPerBlock; 40 } else if (dim==3) { 41 const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 
2 : 1); 42 block_sizes[0] = thread1d; 43 block_sizes[1] = thread1d; 44 block_sizes[2] = elemsPerBlock; 45 } 46 return CEED_ERROR_SUCCESS; 47 } 48 49 static const char *deviceFunctions = QUOTE( 50 51 //------------------------------------------------------------------------------ 52 // Typedefs 53 //------------------------------------------------------------------------------ 54 typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields; 55 typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt; 56 57 typedef struct { 58 CeedInt tidx; 59 CeedInt tidy; 60 CeedInt tidz; 61 CeedInt tid; 62 CeedScalar* slice; 63 } BackendData; 64 65 //------------------------------------------------------------------------------ 66 // Load matrices for basis actions 67 //------------------------------------------------------------------------------ 68 template <int P, int Q> 69 inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 70 for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 71 B[i] = d_B[i]; 72 } 73 74 //------------------------------------------------------------------------------ 75 // 1D 76 //------------------------------------------------------------------------------ 77 78 //------------------------------------------------------------------------------ 79 // L-vector -> E-vector, offsets provided 80 //------------------------------------------------------------------------------ 81 template <int NCOMP, int COMPSTRIDE, int P1d> 82 inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 83 if (data.tidx < P1d) { 84 const CeedInt node = data.tidx; 85 const CeedInt ind = indices[node + elem * P1d]; 86 for (CeedInt comp = 0; comp < NCOMP; ++comp) 87 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 88 } 89 } 90 91 
//------------------------------------------------------------------------------ 92 // L-vector -> E-vector, strided 93 //------------------------------------------------------------------------------ 94 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 95 inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 96 if (data.tidx < P1d) { 97 const CeedInt node = data.tidx; 98 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 99 for (CeedInt comp = 0; comp < NCOMP; ++comp) 100 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 101 } 102 } 103 104 //------------------------------------------------------------------------------ 105 // E-vector -> L-vector, offsets provided 106 //------------------------------------------------------------------------------ 107 template <int NCOMP, int COMPSTRIDE, int P1d> 108 inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 109 if (data.tidx < P1d) { 110 const CeedInt node = data.tidx; 111 const CeedInt ind = indices[node + elem * P1d]; 112 for (CeedInt comp = 0; comp < NCOMP; ++comp) 113 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 114 } 115 } 116 117 //------------------------------------------------------------------------------ 118 // E-vector -> L-vector, strided 119 //------------------------------------------------------------------------------ 120 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 121 inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 122 if (data.tidx < P1d) { 123 const CeedInt node = data.tidx; 124 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 125 for (CeedInt comp = 0; comp < NCOMP; ++comp) 126 d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 127 } 128 } 129 130 
//------------------------------------------------------------------------------ 131 // 1D tensor contraction x 132 //------------------------------------------------------------------------------ 133 template <int NCOMP, int P1d, int Q1d> 134 inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 135 data.slice[data.tidx] = *U; 136 __syncthreads(); 137 *V = 0.0; 138 if (data.tidx < Q1d) 139 for (CeedInt i = 0; i < P1d; ++i) 140 *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 141 __syncthreads(); 142 } 143 144 //------------------------------------------------------------------------------ 145 // 1D transpose tensor contraction x 146 //------------------------------------------------------------------------------ 147 template <int NCOMP, int P1d, int Q1d> 148 inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 149 data.slice[data.tidx] = *U; 150 __syncthreads(); 151 *V = 0.0; 152 if (data.tidx < P1d) 153 for (CeedInt i = 0; i < Q1d; ++i) 154 *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 155 __syncthreads(); 156 } 157 158 //------------------------------------------------------------------------------ 159 // 1D interpolate to quadrature points 160 //------------------------------------------------------------------------------ 161 template <int NCOMP, int P1d, int Q1d> 162 inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 163 for (CeedInt comp = 0; comp < NCOMP; comp++) 164 ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 165 } 166 167 //------------------------------------------------------------------------------ 168 // 1D interpolate transpose 169 //------------------------------------------------------------------------------ 170 template <int NCOMP, int P1d, int Q1d> 171 inline __device__ 
void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 172 for (CeedInt comp=0; comp<NCOMP; comp++) 173 ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 174 } 175 176 //------------------------------------------------------------------------------ 177 // 1D derivatives at quadrature points 178 //------------------------------------------------------------------------------ 179 template <int NCOMP, int P1d, int Q1d> 180 inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 181 for (CeedInt comp = 0; comp < NCOMP; comp++) 182 ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 183 } 184 185 //------------------------------------------------------------------------------ 186 // 1D derivatives transpose 187 //------------------------------------------------------------------------------ 188 template <int NCOMP, int P1d, int Q1d> 189 inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 190 for (CeedInt comp = 0; comp < NCOMP; comp++) 191 ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 192 } 193 194 //------------------------------------------------------------------------------ 195 // 2D 196 //------------------------------------------------------------------------------ 197 198 //------------------------------------------------------------------------------ 199 // L-vector -> E-vector, offsets provided 200 //------------------------------------------------------------------------------ 201 template <int NCOMP, int COMPSTRIDE, int P1d> 202 inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 203 
if (data.tidx < P1d && data.tidy < P1d) { 204 const CeedInt node = data.tidx + data.tidy*P1d; 205 const CeedInt ind = indices[node + elem * P1d*P1d]; 206 for (CeedInt comp = 0; comp < NCOMP; ++comp) 207 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 208 } 209 } 210 211 //------------------------------------------------------------------------------ 212 // L-vector -> E-vector, strided 213 //------------------------------------------------------------------------------ 214 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 215 inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 216 if (data.tidx < P1d && data.tidy < P1d) { 217 const CeedInt node = data.tidx + data.tidy*P1d; 218 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 219 for (CeedInt comp = 0; comp < NCOMP; ++comp) 220 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 221 } 222 } 223 224 //------------------------------------------------------------------------------ 225 // E-vector -> L-vector, offsets provided 226 //------------------------------------------------------------------------------ 227 template <int NCOMP, int COMPSTRIDE, int P1d> 228 inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 229 if (data.tidx < P1d && data.tidy < P1d) { 230 const CeedInt node = data.tidx + data.tidy*P1d; 231 const CeedInt ind = indices[node + elem * P1d*P1d]; 232 for (CeedInt comp = 0; comp < NCOMP; ++comp) 233 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 234 } 235 } 236 237 //------------------------------------------------------------------------------ 238 // E-vector -> L-vector, strided 239 //------------------------------------------------------------------------------ 240 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 241 inline __device__ void 
writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 242 if (data.tidx < P1d && data.tidy < P1d) { 243 const CeedInt node = data.tidx + data.tidy*P1d; 244 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 245 for (CeedInt comp = 0; comp < NCOMP; ++comp) 246 d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 247 } 248 } 249 250 //------------------------------------------------------------------------------ 251 // 2D tensor contraction x 252 //------------------------------------------------------------------------------ 253 template <int NCOMP, int P1d, int Q1d> 254 inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 255 data.slice[data.tidx+data.tidy*T1d] = *U; 256 __syncthreads(); 257 *V = 0.0; 258 if (data.tidx < Q1d && data.tidy < P1d) 259 for (CeedInt i = 0; i < P1d; ++i) 260 *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 261 __syncthreads(); 262 } 263 264 //------------------------------------------------------------------------------ 265 // 2D tensor contract y 266 //------------------------------------------------------------------------------ 267 template <int NCOMP, int P1d, int Q1d> 268 inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 269 data.slice[data.tidx+data.tidy*T1d] = *U; 270 __syncthreads(); 271 *V = 0.0; 272 if (data.tidx < Q1d && data.tidy < Q1d) 273 for (CeedInt i = 0; i < P1d; ++i) 274 *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 275 __syncthreads(); 276 } 277 278 //------------------------------------------------------------------------------ 279 // 2D transpose tensor contract y 280 //------------------------------------------------------------------------------ 281 template <int NCOMP, int P1d, int Q1d> 282 inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar 
*U, const CeedScalar *B, CeedScalar *V) { 283 data.slice[data.tidx+data.tidy*T1d] = *U; 284 __syncthreads(); 285 *V = 0.0; 286 if (data.tidx < Q1d && data.tidy < P1d) 287 for (CeedInt i = 0; i < Q1d; ++i) 288 *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 289 __syncthreads(); 290 } 291 292 //------------------------------------------------------------------------------ 293 // 2D transpose tensor contract x 294 //------------------------------------------------------------------------------ 295 template <int NCOMP, int P1d, int Q1d> 296 inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 297 data.slice[data.tidx+data.tidy*T1d] = *U; 298 __syncthreads(); 299 *V = 0.0; 300 if (data.tidx < P1d && data.tidy < P1d) 301 for (CeedInt i = 0; i < Q1d; ++i) 302 *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 303 __syncthreads(); 304 } 305 306 //------------------------------------------------------------------------------ 307 // 2D transpose tensor contract and add x 308 //------------------------------------------------------------------------------ 309 template <int NCOMP, int P1d, int Q1d> 310 inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 311 data.slice[data.tidx+data.tidy*T1d] = *U; 312 __syncthreads(); 313 if (data.tidx < P1d && data.tidy < P1d) 314 for (CeedInt i = 0; i < Q1d; ++i) 315 *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 316 __syncthreads(); 317 } 318 319 //------------------------------------------------------------------------------ 320 // 2D interpolate to quadrature points 321 //------------------------------------------------------------------------------ 322 template <int NCOMP, int P1d, int Q1d> 323 inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const 
CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 324 CeedScalar r_t[1]; 325 for (CeedInt comp = 0; comp < NCOMP; comp++) { 326 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 327 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 328 } 329 } 330 331 //------------------------------------------------------------------------------ 332 // 2D interpolate transpose 333 //------------------------------------------------------------------------------ 334 template <int NCOMP, int P1d, int Q1d> 335 inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 336 CeedScalar r_t[1]; 337 for (CeedInt comp = 0; comp < NCOMP; comp++) { 338 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 339 ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 340 } 341 } 342 343 //------------------------------------------------------------------------------ 344 // 2D derivatives at quadrature points 345 //------------------------------------------------------------------------------ 346 template <int NCOMP, int P1d, int Q1d> 347 inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 348 CeedScalar r_t[1]; 349 for (CeedInt comp = 0; comp < NCOMP; comp++) { 350 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 351 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 352 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 353 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 354 } 355 } 356 357 //------------------------------------------------------------------------------ 358 // 2D derivatives transpose 359 //------------------------------------------------------------------------------ 360 template <int NCOMP, int P1d, int Q1d> 361 inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar 
*__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 362 CeedScalar r_t[1]; 363 for (CeedInt comp = 0; comp < NCOMP; comp++) { 364 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 365 ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 366 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 367 ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 368 } 369 } 370 371 //------------------------------------------------------------------------------ 372 // 3D 373 //------------------------------------------------------------------------------ 374 375 //------------------------------------------------------------------------------ 376 // L-vector -> E-vector, offsets provided 377 //------------------------------------------------------------------------------ 378 template <int NCOMP, int COMPSTRIDE, int P1d> 379 inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 380 if (data.tidx < P1d && data.tidy < P1d) 381 for (CeedInt z = 0; z < P1d; ++z) { 382 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 383 const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 384 for (CeedInt comp = 0; comp < NCOMP; ++comp) 385 r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 386 } 387 } 388 389 //------------------------------------------------------------------------------ 390 // L-vector -> E-vector, strided 391 //------------------------------------------------------------------------------ 392 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 393 inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 394 if (data.tidx < P1d && data.tidy < P1d) 395 for (CeedInt z = 0; z < P1d; ++z) { 396 const CeedInt node = data.tidx + data.tidy*P1d + 
z*P1d*P1d; 397 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 398 for (CeedInt comp = 0; comp < NCOMP; ++comp) 399 r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 400 } 401 } 402 403 //------------------------------------------------------------------------------ 404 // E-vector -> Q-vector, offests provided 405 //------------------------------------------------------------------------------ 406 template <int NCOMP, int COMPSTRIDE, int Q1d> 407 inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 408 if (data.tidx < Q1d && data.tidy < Q1d) { 409 const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 410 const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 411 for (CeedInt comp = 0; comp < NCOMP; ++comp) 412 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 413 } 414 } 415 416 //------------------------------------------------------------------------------ 417 // E-vector -> Q-vector, strided 418 //------------------------------------------------------------------------------ 419 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 420 inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 421 if (data.tidx < Q1d && data.tidy < Q1d) { 422 const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 423 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 424 for (CeedInt comp = 0; comp < NCOMP; ++comp) 425 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 426 } 427 } 428 429 //------------------------------------------------------------------------------ 430 // E-vector -> L-vector, offsets provided 431 //------------------------------------------------------------------------------ 432 template <int NCOMP, int COMPSTRIDE, int P1d> 433 inline __device__ void writeDofsOffset3d(BackendData& 
data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 434 if (data.tidx < P1d && data.tidy < P1d) 435 for (CeedInt z = 0; z < P1d; ++z) { 436 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 437 const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 438 for (CeedInt comp = 0; comp < NCOMP; ++comp) 439 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 440 } 441 } 442 443 //------------------------------------------------------------------------------ 444 // E-vector -> L-vector, strided 445 //------------------------------------------------------------------------------ 446 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 447 inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 448 if (data.tidx < P1d && data.tidy < P1d) 449 for (CeedInt z = 0; z < P1d; ++z) { 450 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 451 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 452 for (CeedInt comp = 0; comp < NCOMP; ++comp) 453 d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 454 } 455 } 456 457 //------------------------------------------------------------------------------ 458 // 3D tensor contract x 459 //------------------------------------------------------------------------------ 460 template <int NCOMP, int P1d, int Q1d> 461 inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 462 CeedScalar r_B[P1d]; 463 for (CeedInt i = 0; i < P1d; ++i) 464 r_B[i] = B[i + data.tidx*P1d]; 465 466 for (CeedInt k = 0; k < P1d; ++k) { 467 data.slice[data.tidx+data.tidy*T1d] = U[k]; 468 __syncthreads(); 469 V[k] = 0.0; 470 if (data.tidx < Q1d && data.tidy < P1d) 471 for (CeedInt i = 0; i < P1d; ++i) 472 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 473 __syncthreads(); 474 } 475 } 476 477 
//------------------------------------------------------------------------------ 478 // 3D tensor contract y 479 //------------------------------------------------------------------------------ 480 template <int NCOMP, int P1d, int Q1d> 481 inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 482 CeedScalar r_B[P1d]; 483 for (CeedInt i = 0; i < P1d; ++i) 484 r_B[i] = B[i + data.tidy*P1d]; 485 486 for (CeedInt k = 0; k < P1d; ++k) { 487 data.slice[data.tidx+data.tidy*T1d] = U[k]; 488 __syncthreads(); 489 V[k] = 0.0; 490 if (data.tidx < Q1d && data.tidy < Q1d) 491 for (CeedInt i = 0; i < P1d; ++i) 492 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 493 __syncthreads(); 494 } 495 } 496 497 //------------------------------------------------------------------------------ 498 // 3D tensor contract z 499 //------------------------------------------------------------------------------ 500 template <int NCOMP, int P1d, int Q1d> 501 inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 502 for (CeedInt k = 0; k < Q1d; ++k) { 503 V[k] = 0.0; 504 if (data.tidx < Q1d && data.tidy < Q1d) 505 for (CeedInt i = 0; i < P1d; ++i) 506 V[k] += B[i + k*P1d] * U[i]; // Contract z direction 507 } 508 } 509 510 //------------------------------------------------------------------------------ 511 // 3D transpose tensor contract z 512 //------------------------------------------------------------------------------ 513 template <int NCOMP, int P1d, int Q1d> 514 inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 515 for (CeedInt k = 0; k < P1d; ++k) { 516 V[k] = 0.0; 517 if (data.tidx < Q1d && data.tidy < Q1d) 518 for (CeedInt i = 0; i < Q1d; ++i) 519 V[k] += B[k + i*P1d] * U[i]; // Contract z direction 520 } 521 } 522 523 
//------------------------------------------------------------------------------ 524 // 3D transpose tensor contract y 525 //------------------------------------------------------------------------------ 526 template <int NCOMP, int P1d, int Q1d> 527 inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 528 CeedScalar r_B[Q1d]; 529 for (CeedInt i = 0; i < Q1d; ++i) 530 r_B[i] = B[data.tidy + i*P1d]; 531 532 for (CeedInt k = 0; k < P1d; ++k) { 533 data.slice[data.tidx+data.tidy*T1d] = U[k]; 534 __syncthreads(); 535 V[k] = 0.0; 536 if (data.tidx < Q1d && data.tidy < P1d) 537 for (CeedInt i = 0; i < Q1d; ++i) 538 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 539 __syncthreads(); 540 } 541 } 542 543 //------------------------------------------------------------------------------ 544 // 3D transpose tensor contract add y 545 //------------------------------------------------------------------------------ 546 template <int NCOMP, int P1d, int Q1d> 547 inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 548 CeedScalar r_B[Q1d]; 549 for (CeedInt i = 0; i < Q1d; ++i) 550 r_B[i] = B[data.tidy + i*P1d]; 551 552 for (CeedInt k = 0; k < P1d; ++k) { 553 data.slice[data.tidx+data.tidy*T1d] = U[k]; 554 __syncthreads(); 555 if (data.tidx < Q1d && data.tidy < P1d) 556 for (CeedInt i = 0; i < Q1d; ++i) 557 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 558 __syncthreads(); 559 } 560 } 561 562 //------------------------------------------------------------------------------ 563 // 3D transpose tensor contract x 564 //------------------------------------------------------------------------------ 565 template <int NCOMP, int P1d, int Q1d> 566 inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 567 CeedScalar r_B[Q1d]; 568 for 
(CeedInt i = 0; i < Q1d; ++i) 569 r_B[i] = B[data.tidx + i*P1d]; 570 571 for (CeedInt k = 0; k < P1d; ++k) { 572 data.slice[data.tidx+data.tidy*T1d] = U[k]; 573 __syncthreads(); 574 V[k] = 0.0; 575 if (data.tidx < P1d && data.tidy < P1d) 576 for (CeedInt i = 0; i < Q1d; ++i) 577 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 578 __syncthreads(); 579 } 580 } 581 582 //------------------------------------------------------------------------------ 583 // 3D transpose tensor contract add x 584 //------------------------------------------------------------------------------ 585 template <int NCOMP, int P1d, int Q1d> 586 inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 587 CeedScalar r_B[Q1d]; 588 for (CeedInt i = 0; i < Q1d; ++i) 589 r_B[i] = B[data.tidx + i*P1d]; 590 591 for (CeedInt k = 0; k < P1d; ++k) { 592 data.slice[data.tidx+data.tidy*T1d] = U[k]; 593 __syncthreads(); 594 if (data.tidx < P1d && data.tidy < P1d) 595 for (CeedInt i = 0; i < Q1d; ++i) 596 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 597 __syncthreads(); 598 } 599 } 600 601 //------------------------------------------------------------------------------ 602 // 3D interpolate to quadrature points 603 //------------------------------------------------------------------------------ 604 template <int NCOMP, int P1d, int Q1d> 605 inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 606 CeedScalar r_t1[T1d]; 607 CeedScalar r_t2[T1d]; 608 for (CeedInt comp = 0; comp < NCOMP; comp++) { 609 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 610 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 611 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 612 } 613 } 614 615 //------------------------------------------------------------------------------ 616 // 3D interpolate 
transpose 617 //------------------------------------------------------------------------------ 618 template <int NCOMP, int P1d, int Q1d> 619 inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 620 CeedScalar r_t1[T1d]; 621 CeedScalar r_t2[T1d]; 622 for (CeedInt comp = 0; comp < NCOMP; comp++) { 623 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 624 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 625 ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 626 } 627 } 628 629 //------------------------------------------------------------------------------ 630 // 3D derivatives at quadrature points 631 //------------------------------------------------------------------------------ 632 template <int NCOMP, int P1d, int Q1d> 633 inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 634 CeedScalar r_t1[T1d]; 635 CeedScalar r_t2[T1d]; 636 for (CeedInt comp = 0; comp < NCOMP; comp++) { 637 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 638 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 639 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 640 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 641 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 642 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 643 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 644 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 645 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 646 } 647 } 648 649 //------------------------------------------------------------------------------ 650 // 3D derivatives transpose 651 //------------------------------------------------------------------------------ 652 
// Transpose of grad3d: each derivative slot of r_U is contracted back with
// the transposed matrices; the first pass overwrites r_V (ContractTransposeX3d
// zeros its output), the second and third accumulate via the Add variant.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // x-derivative slot (offset 0*NCOMP*Q1d): overwrites r_V
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    // y-derivative slot: accumulates into r_V
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    // z-derivative slot: accumulates into r_V
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Derivatives on the z = q slice using the collocated gradient matrix c_G
// (Q1d x Q1d): x and y derivatives via a shared-memory slice exchange, z
// derivative directly from the per-thread register column r_U.
// NOTE(review): the __syncthreads() calls here execute inside the
// (tidx < Q1d && tidy < Q1d) guard; if the block is launched with more than
// Q1d threads per dimension (T1d > Q1d), guarded-out threads skip the
// barrier -- confirm this divergent-barrier pattern is safe for all launch
// configurations used with this path.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
      __syncthreads();
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      // Z derivative
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
      __syncthreads();
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose of gradCollo3d: accumulates the three derivative contributions of
// slice q back into r_V; the z contribution is only partial here, completed
// as the caller iterates over all q slices.
// NOTE(review): same guarded-__syncthreads() pattern as gradCollo3d above.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // X derivative
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      __syncthreads();
      // Y derivative
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      __syncthreads();
      // Z derivative
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
    }
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
// Load this thread's 1D quadrature weight (0 for threads beyond Q1d).
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = (data.tidx < Q1d) ?
qweight1d[data.tidx] : 0.0;
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// Tensor product of the 1D weights at this thread's (tidx, tidy) point.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = (data.tidx < Q1d && data.tidy < Q1d) ?
       qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
// Tensor product of the 1D weights; each thread fills all Q1d weights along z.
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
  const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
  for (CeedInt z = 0; z < Q1d; ++z)
    w[z] = quad ? pw*qweight1d[z] : 0.0;
}

);
//------------------------------------------------------------------------------
// Build single operator kernel
//------------------------------------------------------------------------------
// Generate, JIT-compile, and cache the fused operator kernel for op.
// The kernel source is assembled as a string: the deviceFunctions helpers
// above, the user QFunction source, and a generated __global__ entry point
// that fuses restriction, basis action, and QFunction application for each
// element. No-op if the operator has already been set up.
extern "C" int CeedHipGenOperatorBuild(CeedOperator op) {

  using std::ostringstream;
  using std::string;
  int ierr;
  bool setupdone;
  ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
  if (setupdone) return CEED_ERROR_SUCCESS;
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
  CeedOperator_Hip_gen *data;
  ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
  CeedQFunction qf;
  CeedQFunction_Hip_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
  ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
  CeedSize lsize;
  // NOTE(review): Q is queried below but not referenced afterwards in this
  // builder; only Q1d (the 1D quadrature size) feeds the generated code.
  CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim = 0;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
  CeedChkBackend(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
  CeedChkBackend(ierr);
  CeedEvalMode emode;
  CeedBasis basis;
  CeedBasis_Hip_shared *basis_data;
  CeedElemRestriction Erestrict;
  CeedElemRestriction_Hip *restr_data;

  // Check for restriction only identity operator
  bool is_identity_qf;
  ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
  if (is_identity_qf) {
    CeedEvalMode emodein, emodeout;
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein); CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout); CeedChkBackend(ierr);
    if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
      // LCOV_EXCL_START
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "Backend does not implement restriction only identity operators");
      // LCOV_EXCL_STOP
  }

  // Start the generated source with the shared device helper functions
  ostringstream code;
  string devFunctions(deviceFunctions);
  code << devFunctions;

  string qFunction(qf_data->qFunctionSource);
  string qFunctionName(qf_data->qFunctionName);
  string oper;
  oper = "CeedKernel_Hip_gen_" + qFunctionName;

  // Macros expected by the user QFunction source
  code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
  code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n";
  code << "#define CeedPragmaSIMD\n";
  code << "#define CEED_ERROR_SUCCESS 0\n\n";

  // Find dim and Q1d
  // useCollograd stays true only if every non-collocated basis provides a
  // collocated gradient matrix (d_collo_grad_1d).
  // NOTE(review): the collocated-gradient path below emits 3d-only helpers
  // (readSliceQuads*3d, gradCollo3d) -- presumably d_collo_grad_1d is only
  // set for dim == 3 bases; confirm against the hip-shared basis setup.
  bool useCollograd = true;
  data->maxP1d = 0;
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
      CeedChkBackend(ierr);

      // Check for collocated gradient
      useCollograd = useCollograd && basis_data->d_collo_grad_1d;

      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
        ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
        if (P1d>data->maxP1d) data->maxP1d = P1d;
      } else {
        // LCOV_EXCL_START
        return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
        // LCOV_EXCL_STOP
      }
    }
  }
  // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
  for (CeedInt i = 0; i < numoutputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);

    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChkBackend(ierr);

      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
      } else {
        // LCOV_EXCL_START
        return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
        // LCOV_EXCL_STOP
      }

      // Check for collocated gradient
      useCollograd = useCollograd && basis_data->d_collo_grad_1d;
    }
  }
  data->dim = dim;
  data->Q1d = Q1d;

  // Define CEED_Q_VLA
  // The QFunction is applied per point (1) on the collocated/low-dim path,
  // or per z-column of Q1d points on the non-collocated 3D path.
  if (dim != 3 || useCollograd) {
    code << "\n#define CEED_Q_VLA 1\n\n";
  } else {
    code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
  }

  // Append the user QFunction source verbatim
  code << qFunction;

  // Setup: emit the fused kernel signature and per-field device pointers
  code << "\n// -----------------------------------------------------------------------------\n";
  code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
  code << "__global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
      code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
    }
  }

  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
  }

  code << " const CeedInt Dim = "<<dim<<";\n";
  code << " const CeedInt Q1d = "<<Q1d<<";\n";

  // Per-thread indices and the dynamic shared-memory slice for this tidz
  code << " HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
  code << " BackendData data;\n";
  code << " data.tidx = threadIdx.x;\n";
  code << " data.tidy = threadIdx.y;\n";
  code << " data.tidz = threadIdx.z;\n";
  code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
  code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";

  code << "\n // -- Input field constants and basis data --\n";
  //Initialize constants, and matrices B and G
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);

    // Set field constants
    if (emode != CEED_EVAL_WEIGHT) {
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
      if (basis != CEED_BASIS_COLLOCATED) {
        ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
        code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
      } else {
        code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
      }
      code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
    }

    // Load basis data: stash device matrix pointers in data->B/G for launch
    // arguments, and emit shared-memory staging in the generated kernel
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.in[i] = basis_data->d_interp_1d;
      code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.in[i] = basis_data->d_interp_1d;
      code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      if (useCollograd) {
        // Collocated gradient matrix is Q1d x Q1d
        data->G.in[i] = basis_data->d_collo_grad_1d;
        code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
        code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      } else {
        data->G.in[i] = basis_data->d_grad_1d;
        code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
        code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  code << "\n // -- Output field constants and basis data --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);

    // Set field constants
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
      code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
    } else {
      code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
    }
    code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";

    // Load basis data (mirrors the input-field staging above)
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      break; // No action
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.out[i] = basis_data->d_interp_1d;
      code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.out[i] = basis_data->d_interp_1d;
      code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      if (useCollograd) {
        data->G.out[i] = basis_data->d_collo_grad_1d;
        code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
        code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
      } else {
        data->G.out[i] = basis_data->d_grad_1d;
        code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
        code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    // LCOV_EXCL_STOP
    }
  }
  // Grid-stride element loop: each z-layer of the block handles one element
  code << "\n // -- Element loop --\n";
  code << " __syncthreads();\n";
  code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
  // Input basis apply if needed
  // Generate the correct eval mode code for each input
  code << " // -- Input field restrictions and basis actions --\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);

    // Restriction: gather this element's dofs into registers, except for
    // weights and for NONE-mode inputs on the collograd path (read per slice)
    if (emode != CEED_EVAL_WEIGHT &&
        !((emode == CEED_EVAL_NONE) && useCollograd)) {
      code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";

      bool isStrided;
      ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
      if (!isStrided) {
        ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
        CeedChkBackend(ierr);
        code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
        CeedInt compstride;
        ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
        code << " // CompStride: "<<compstride<<"\n";
        ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
        data->indices.in[i] = restr_data->d_ind;
        code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
      } else {
        bool backendstrides;
        ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
        CeedChkBackend(ierr);
        CeedInt nelem;
        ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
        CeedChkBackend(ierr);
        CeedInt strides[3] = {1, elemsize*nelem, elemsize};
        if (!backendstrides) {
          ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
          CeedChkBackend(ierr);
        }
        code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
        code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
      }
    }

    // Basis action: produce r_t<i> holding the field at quadrature points
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      if (!useCollograd) {
        code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
      }
      break;
    case CEED_EVAL_INTERP:
      code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
      code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      if (useCollograd) {
        // Interp only here; the gradient is applied per slice in the q loop
        code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
        code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
      } else {
        code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
        code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      code << " CeedScalar r_t"<<i<<"[Q1d];\n";
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->W = basis_data->d_q_weight_1d;
      code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  // Q function: declare output accumulators r_tt<i>
  code << "\n // -- Output field setup --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "\n // ---- Output field "<<i<<" ----\n";
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode==CEED_EVAL_GRAD)
    {
      if (useCollograd) {
        //Accumulator for gradient slices (zero-initialized; slices add in)
        code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
        code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
        code << " for (CeedInt j = 0; j < Q1d; ++j) {\n";
        code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
        code << " }\n";
        code << " }\n";
      } else {
        code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
      }
    }
    if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
    {
      code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
    }
  }
  // We treat quadrature points per slice in 3d to save registers
  if (useCollograd) {
    code << "\n // Note: Collocated Gradient\n";
    code << "#pragma unroll\n";
    code << " for (CeedInt q=0; q<Q1d; q++) {\n";
    code << " // -- Input fields --\n";
    for (CeedInt i = 0; i < numinputfields; i++) {
      code << " // ---- Input field "<<i<<" ----\n";
      // Get elemsize, emode, ncomp
      ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
      CeedChkBackend(ierr);
      // Basis action: build the per-slice QFunction input r_q<i>
      code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
      switch (emode) {
      case CEED_EVAL_NONE:
        // NONE inputs were not gathered above; read slice q directly
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";

        bool isStrided;
        ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
        ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
        ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
        if (!isStrided) {
          ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
          CeedChkBackend(ierr);
          code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
          CeedInt compstride;
          ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
          code << " // CompStride: "<<compstride<<"\n";
          ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
          data->indices.in[i] = restr_data->d_ind;
          code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
        } else {
          bool backendstrides;
          ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
          CeedChkBackend(ierr);
          CeedInt nelem;
          ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
          CeedChkBackend(ierr);
          CeedInt strides[3] = {1, elemsize*nelem, elemsize};
          if (!backendstrides) {
            ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
            CeedChkBackend(ierr);
          }
          code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
          code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
        }
        break;
      case CEED_EVAL_INTERP:
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
        code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
        code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
        code << " }\n";
        break;
      case CEED_EVAL_GRAD:
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
        code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        code << " CeedScalar r_q"<<i<<"[1];\n";
        code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
        break; // No action
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    code << "\n // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChkBackend(ierr);
      // Basis action: per-slice QFunction output buffers r_qq<i>
      switch (emode) {
      case CEED_EVAL_NONE:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
        break;
      case CEED_EVAL_GRAD:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
  } else {
    // Non-collocated path: QFunction reads/writes the full r_t/r_tt buffers
    code << "\n // Note: No Collocated Gradient\n";
    code << " // -- Input fields --\n";
    for (CeedInt i = 0; i < numinputfields; i++) {
      code << " // ---- Input field "<<i<<" ----\n";
      code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
    }
    code << " // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
    }
  }
  // Marshal pointers and invoke the user QFunction
  code << "\n // -- QFunction Inputs and outputs --\n";
  code << " CeedScalar* in["<<numinputfields<<"];\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    code << " in["<<i<<"] = r_q"<<i<<";\n";
  }
  code << " CeedScalar* out["<<numoutputfields<<"];\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    code << " out["<<i<<"] = r_qq"<<i<<";\n";
  }
  code << "\n // -- Apply QFunction --\n";
  code << " "<<qFunctionName<<"(ctx, ";
  if (dim != 3 || useCollograd) {
    code << "1";
  } else {
    code << "Q1d";
  }
  code << ", in, out);\n";
  if (useCollograd) {
    // Fold this slice's QFunction outputs back into the r_tt accumulators,
    // then close the q loop
    code << "\n // Note: Collocated Gradient\n";
    code << " // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChkBackend(ierr);
      // Basis action
      code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
      switch (emode) {
      case CEED_EVAL_NONE:
        code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << " }\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << " }\n";
        break;
      case CEED_EVAL_GRAD:
        code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    code << " }\n";
  }

  // Output basis apply if needed
  // Generate the correct eval mode code for each output
  code << "\n // -- Output field basis action and restrictions --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);
    // Basis action: map quadrature-space r_tt<i> back to dof-space r_v<i>
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
      break; // No action
    case CEED_EVAL_INTERP:
      code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      if (useCollograd) {
        // Gradient already applied per slice; only interp-transpose remains
        code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      } else {
        code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    // LCOV_EXCL_STOP
    }
    // Restriction: scatter r_v<i> back to the L-vector d_v<i>
    bool isStrided;
    ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
    if (!isStrided) {
      ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
      CeedChkBackend(ierr);
      code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
      CeedInt compstride;
      ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
      code << " // CompStride: "<<compstride<<"\n";
      ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
      data->indices.out[i] = restr_data->d_ind;
      code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
    } else {
      bool backendstrides;
      ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
      CeedChkBackend(ierr);
      CeedInt nelem;
      ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
      CeedChkBackend(ierr);
      CeedInt strides[3] = {1, elemsize*nelem, elemsize};
      if (!backendstrides) {
        ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
        CeedChkBackend(ierr);
      }
      code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
      code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
    }
  }

  // Close the element loop and the kernel body
  code << " }\n";
  code << "}\n";
  code << "// -----------------------------------------------------------------------------\n\n";

  // View kernel for debugging
  CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
  CeedDebug(ceed, code.str().c_str());

  // JIT compile via hiprtc; T1d = block_sizes[0] = max(P1d, Q1d) and
  // BLOCK_SIZE are injected as compile-time constants
  CeedInt block_sizes[3];
  ierr = BlockGridCalculate_Hip_gen(dim, numelements, data->maxP1d, Q1d, block_sizes);
  CeedChkBackend(ierr);
  ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2,
                        "T1d", block_sizes[0],
                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
  CeedChkBackend(ierr);

  ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
//------------------------------------------------------------------------------