1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 #define CEED_DEBUG_COLOR 12 17 18 #include "ceed-hip-gen.h" 19 #include <iostream> 20 #include <sstream> 21 #include "../hip-shared/ceed-hip-shared.h" 22 #include "../hip/ceed-hip-compile.h" 23 24 static const char *atomicAdd = QUOTE( 25 //------------------------------------------------------------------------------ 26 // Atomic add, for older CUDA 27 //------------------------------------------------------------------------------ 28 __device__ double atomicAdd(double *address, double val) { 29 unsigned long long int *address_as_ull = (unsigned long long int *)address; 30 unsigned long long int old = *address_as_ull, assumed; 31 do { 32 assumed = old; 33 old = 34 atomicCAS(address_as_ull, assumed, 35 __double_as_longlong(val + 36 __longlong_as_double(assumed))); 37 // Note: uses integer comparison to avoid hang in case of NaN 38 // (since NaN != NaN) 39 } while (assumed != old); 40 return __longlong_as_double(old); 41 } 42 ); 43 44 static const char *deviceFunctions = QUOTE( 45 46 //------------------------------------------------------------------------------ 47 // Typedefs 48 //------------------------------------------------------------------------------ 49 typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields; 50 typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt; 51 52 typedef struct { 53 CeedInt tidx; 54 CeedInt tidy; 55 CeedInt tidz; 56 CeedInt tid; 57 CeedScalar* slice; 58 } BackendData; 59 60 //------------------------------------------------------------------------------ 61 // Load matrices for basis actions 62 //------------------------------------------------------------------------------ 63 template <int P, int Q> 64 inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 65 for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 66 B[i] = d_B[i]; 67 } 68 69 //------------------------------------------------------------------------------ 70 // 1D 71 //------------------------------------------------------------------------------ 72 73 //------------------------------------------------------------------------------ 74 // L-vector -> E-vector, offsets provided 75 //------------------------------------------------------------------------------ 76 template <int NCOMP, int COMPSTRIDE, int P1d> 77 inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 78 if (data.tidx < P1d) { 79 const CeedInt node = data.tidx; 80 const CeedInt ind = indices[node + elem * P1d]; 81 for (CeedInt comp = 0; comp < NCOMP; ++comp) 82 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 83 } 84 } 85 86 //------------------------------------------------------------------------------ 87 // L-vector -> E-vector, strided 88 //------------------------------------------------------------------------------ 89 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 90 inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 91 if (data.tidx < P1d) { 92 const CeedInt node = data.tidx; 93 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 94 for (CeedInt comp = 0; comp < NCOMP; ++comp) 95 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 96 } 97 } 98 99 //------------------------------------------------------------------------------ 100 // E-vector -> L-vector, offsets provided 101 //------------------------------------------------------------------------------ 102 template <int NCOMP, int COMPSTRIDE, int P1d> 103 inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 104 if (data.tidx < P1d) { 105 const CeedInt node = data.tidx; 106 const CeedInt ind = indices[node + elem * P1d]; 107 for (CeedInt comp = 0; comp < NCOMP; ++comp) 108 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 109 } 110 } 111 112 //------------------------------------------------------------------------------ 113 // E-vector -> L-vector, strided 114 //------------------------------------------------------------------------------ 115 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 116 inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 117 if (data.tidx < P1d) { 118 const CeedInt node = data.tidx; 119 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 120 for (CeedInt comp = 0; comp < NCOMP; ++comp) 121 d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 122 } 123 } 124 125 //------------------------------------------------------------------------------ 126 // 1D tensor contraction x 127 //------------------------------------------------------------------------------ 128 template <int NCOMP, int P1d, int Q1d> 129 inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 130 data.slice[data.tidx] = *U; 131 __syncthreads(); 132 *V = 0.0; 133 if (data.tidx < Q1d) 134 for (CeedInt i = 0; i < P1d; ++i) 135 *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 136 __syncthreads(); 137 } 138 139 //------------------------------------------------------------------------------ 140 // 1D transpose tensor contraction x 141 //------------------------------------------------------------------------------ 142 template <int NCOMP, int P1d, int Q1d> 143 inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 144 data.slice[data.tidx] = *U; 145 __syncthreads(); 146 *V = 0.0; 147 if (data.tidx < P1d) 148 for (CeedInt i = 0; i < Q1d; ++i) 149 *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 150 __syncthreads(); 151 } 152 153 //------------------------------------------------------------------------------ 154 // 1D interpolate to quadrature points 155 //------------------------------------------------------------------------------ 156 template <int NCOMP, int P1d, int Q1d> 157 inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 158 for (CeedInt comp = 0; comp < NCOMP; comp++) 159 ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 160 } 161 162 //------------------------------------------------------------------------------ 163 // 1D interpolate transpose 164 //------------------------------------------------------------------------------ 165 template <int NCOMP, int P1d, int Q1d> 166 inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 167 for (CeedInt comp=0; comp<NCOMP; comp++) 168 ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 169 } 170 171 //------------------------------------------------------------------------------ 172 // 1D derivatives at quadrature points 173 //------------------------------------------------------------------------------ 174 template <int NCOMP, int P1d, int Q1d> 175 inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 176 for (CeedInt comp = 0; comp < NCOMP; comp++) 177 ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 178 } 179 180 //------------------------------------------------------------------------------ 181 // 1D derivatives transpose 182 //------------------------------------------------------------------------------ 183 template <int NCOMP, int P1d, int Q1d> 184 inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 185 for (CeedInt comp = 0; comp < NCOMP; comp++) 186 ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 187 } 188 189 //------------------------------------------------------------------------------ 190 // 2D 191 //------------------------------------------------------------------------------ 192 193 //------------------------------------------------------------------------------ 194 // L-vector -> E-vector, offsets provided 195 //------------------------------------------------------------------------------ 196 template <int NCOMP, int COMPSTRIDE, int P1d> 197 inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 198 if (data.tidx < P1d && data.tidy < P1d) { 199 const CeedInt node = data.tidx + data.tidy*P1d; 200 const CeedInt ind = indices[node + elem * P1d*P1d]; 201 for (CeedInt comp = 0; comp < NCOMP; ++comp) 202 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 203 } 204 } 205 206 //------------------------------------------------------------------------------ 207 // L-vector -> E-vector, strided 208 //------------------------------------------------------------------------------ 209 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 210 inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 211 if (data.tidx < P1d && data.tidy < P1d) { 212 const CeedInt node = data.tidx + data.tidy*P1d; 213 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 214 for (CeedInt comp = 0; comp < NCOMP; ++comp) 215 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 216 } 217 } 218 219 //------------------------------------------------------------------------------ 220 // E-vector -> L-vector, offsets provided 221 //------------------------------------------------------------------------------ 222 template <int NCOMP, int COMPSTRIDE, int P1d> 223 inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 224 if (data.tidx < P1d && data.tidy < P1d) { 225 const CeedInt node = data.tidx + data.tidy*P1d; 226 const CeedInt ind = indices[node + elem * P1d*P1d]; 227 for (CeedInt comp = 0; comp < NCOMP; ++comp) 228 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 229 } 230 } 231 232 //------------------------------------------------------------------------------ 233 // E-vector -> L-vector, strided 234 //------------------------------------------------------------------------------ 235 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 236 inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 237 if (data.tidx < P1d && data.tidy < P1d) { 238 const CeedInt node = data.tidx + data.tidy*P1d; 239 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 240 for (CeedInt comp = 0; comp < NCOMP; ++comp) 241 d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 242 } 243 } 244 245 //------------------------------------------------------------------------------ 246 // 2D tensor contraction x 247 //------------------------------------------------------------------------------ 248 template <int NCOMP, int P1d, int Q1d> 249 inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 250 data.slice[data.tidx+data.tidy*T1d] = *U; 251 __syncthreads(); 252 *V = 0.0; 253 if (data.tidx < Q1d && data.tidy < P1d) 254 for (CeedInt i = 0; i < P1d; ++i) 255 *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 256 __syncthreads(); 257 } 258 259 //------------------------------------------------------------------------------ 260 // 2D tensor contract y 261 //------------------------------------------------------------------------------ 262 template <int NCOMP, int P1d, int Q1d> 263 inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 264 data.slice[data.tidx+data.tidy*T1d] = *U; 265 __syncthreads(); 266 *V = 0.0; 267 if (data.tidx < Q1d && data.tidy < Q1d) 268 for (CeedInt i = 0; i < P1d; ++i) 269 *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 270 __syncthreads(); 271 } 272 273 //------------------------------------------------------------------------------ 274 // 2D transpose tensor contract y 275 //------------------------------------------------------------------------------ 276 template <int NCOMP, int P1d, int Q1d> 277 inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 278 data.slice[data.tidx+data.tidy*T1d] = *U; 279 __syncthreads(); 280 *V = 0.0; 281 if (data.tidx < Q1d && data.tidy < P1d) 282 for (CeedInt i = 0; i < Q1d; ++i) 283 *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 284 __syncthreads(); 285 } 286 287 //------------------------------------------------------------------------------ 288 // 2D transpose tensor contract x 289 //------------------------------------------------------------------------------ 290 template <int NCOMP, int P1d, int Q1d> 291 inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 292 data.slice[data.tidx+data.tidy*T1d] = *U; 293 __syncthreads(); 294 *V = 0.0; 295 if (data.tidx < P1d && data.tidy < P1d) 296 for (CeedInt i = 0; i < Q1d; ++i) 297 *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 298 __syncthreads(); 299 } 300 301 //------------------------------------------------------------------------------ 302 // 2D transpose tensor contract and add x 303 //------------------------------------------------------------------------------ 304 template <int NCOMP, int P1d, int Q1d> 305 inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 306 data.slice[data.tidx+data.tidy*T1d] = *U; 307 __syncthreads(); 308 if (data.tidx < P1d && data.tidy < P1d) 309 for (CeedInt i = 0; i < Q1d; ++i) 310 *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 311 __syncthreads(); 312 } 313 314 //------------------------------------------------------------------------------ 315 // 2D interpolate to quadrature points 316 //------------------------------------------------------------------------------ 317 template <int NCOMP, int P1d, int Q1d> 318 inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 319 CeedScalar r_t[1]; 320 for (CeedInt comp = 0; comp < NCOMP; comp++) { 321 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 322 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 323 } 324 } 325 326 //------------------------------------------------------------------------------ 327 // 2D interpolate transpose 328 //------------------------------------------------------------------------------ 329 template <int NCOMP, int P1d, int Q1d> 330 inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 331 CeedScalar r_t[1]; 332 for (CeedInt comp = 0; comp < NCOMP; comp++) { 333 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 334 ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 335 } 336 } 337 338 //------------------------------------------------------------------------------ 339 // 2D derivatives at quadrature points 340 //------------------------------------------------------------------------------ 341 template <int NCOMP, int P1d, int Q1d> 342 inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 343 CeedScalar r_t[1]; 344 for (CeedInt comp = 0; comp < NCOMP; comp++) { 345 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 346 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 347 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 348 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 349 } 350 } 351 352 //------------------------------------------------------------------------------ 353 // 2D derivatives transpose 354 //------------------------------------------------------------------------------ 355 template <int NCOMP, int P1d, int Q1d> 356 inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 357 CeedScalar r_t[1]; 358 for (CeedInt comp = 0; comp < NCOMP; comp++) { 359 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 360 ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 361 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 362 ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 363 } 364 } 365 366 //------------------------------------------------------------------------------ 367 // 3D 368 //------------------------------------------------------------------------------ 369 370 //------------------------------------------------------------------------------ 371 // L-vector -> E-vector, offsets provided 372 //------------------------------------------------------------------------------ 373 template <int NCOMP, int COMPSTRIDE, int P1d> 374 inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 375 if (data.tidx < P1d && data.tidy < P1d) 376 for (CeedInt z = 0; z < P1d; ++z) { 377 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 378 const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 379 for (CeedInt comp = 0; comp < NCOMP; ++comp) 380 r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 381 } 382 } 383 384 //------------------------------------------------------------------------------ 385 // L-vector -> E-vector, strided 386 //------------------------------------------------------------------------------ 387 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 388 inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 389 if (data.tidx < P1d && data.tidy < P1d) 390 for (CeedInt z = 0; z < P1d; ++z) { 391 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 392 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 393 for (CeedInt comp = 0; comp < NCOMP; ++comp) 394 r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 395 } 396 } 397 398 //------------------------------------------------------------------------------ 399 // E-vector -> Q-vector, offests provided 400 //------------------------------------------------------------------------------ 401 template <int NCOMP, int COMPSTRIDE, int Q1d> 402 inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 403 if (data.tidx < Q1d && data.tidy < Q1d) { 404 const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 405 const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 406 for (CeedInt comp = 0; comp < NCOMP; ++comp) 407 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 408 } 409 } 410 411 //------------------------------------------------------------------------------ 412 // E-vector -> Q-vector, strided 413 //------------------------------------------------------------------------------ 414 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 415 inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 416 if (data.tidx < Q1d && data.tidy < Q1d) { 417 const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 418 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 419 for (CeedInt comp = 0; comp < NCOMP; ++comp) 420 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 421 } 422 } 423 424 //------------------------------------------------------------------------------ 425 // E-vector -> L-vector, offsets provided 426 //------------------------------------------------------------------------------ 427 template <int NCOMP, int COMPSTRIDE, int P1d> 428 inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 429 if (data.tidx < P1d && data.tidy < P1d) 430 for (CeedInt z = 0; z < P1d; ++z) { 431 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 432 const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 433 for (CeedInt comp = 0; comp < NCOMP; ++comp) 434 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 435 } 436 } 437 438 //------------------------------------------------------------------------------ 439 // E-vector -> L-vector, strided 440 //------------------------------------------------------------------------------ 441 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 442 inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 443 if (data.tidx < P1d && data.tidy < P1d) 444 for (CeedInt z = 0; z < P1d; ++z) { 445 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 446 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 447 for (CeedInt comp = 0; comp < NCOMP; ++comp) 448 d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 449 } 450 } 451 452 //------------------------------------------------------------------------------ 453 // 3D tensor contract x 454 //------------------------------------------------------------------------------ 455 template <int NCOMP, int P1d, int Q1d> 456 inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 457 CeedScalar r_B[P1d]; 458 for (CeedInt i = 0; i < P1d; ++i) 459 r_B[i] = B[i + data.tidx*P1d]; 460 461 for (CeedInt k = 0; k < P1d; ++k) { 462 data.slice[data.tidx+data.tidy*T1d] = U[k]; 463 __syncthreads(); 464 V[k] = 0.0; 465 if (data.tidx < Q1d && data.tidy < P1d) 466 for (CeedInt i = 0; i < P1d; ++i) 467 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 468 __syncthreads(); 469 } 470 } 471 472 //------------------------------------------------------------------------------ 473 // 3D tensor contract y 474 //------------------------------------------------------------------------------ 475 template <int NCOMP, int P1d, int Q1d> 476 inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 477 CeedScalar r_B[P1d]; 478 for (CeedInt i = 0; i < P1d; ++i) 479 r_B[i] = B[i + data.tidy*P1d]; 480 481 for (CeedInt k = 0; k < P1d; ++k) { 482 data.slice[data.tidx+data.tidy*T1d] = U[k]; 483 __syncthreads(); 484 V[k] = 0.0; 485 if (data.tidx < Q1d && data.tidy < Q1d) 486 for (CeedInt i = 0; i < P1d; ++i) 487 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 488 __syncthreads(); 489 } 490 } 491 492 //------------------------------------------------------------------------------ 493 // 3D tensor contract z 494 //------------------------------------------------------------------------------ 495 template <int NCOMP, int P1d, int Q1d> 496 inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 497 for (CeedInt k = 0; k < Q1d; ++k) { 498 V[k] = 0.0; 499 if (data.tidx < Q1d && data.tidy < Q1d) 500 for (CeedInt i = 0; i < P1d; ++i) 501 V[k] += B[i + k*P1d] * U[i]; // Contract z direction 502 } 503 } 504 505 //------------------------------------------------------------------------------ 506 // 3D transpose tensor contract z 507 //------------------------------------------------------------------------------ 508 template <int NCOMP, int P1d, int Q1d> 509 inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 510 for (CeedInt k = 0; k < P1d; ++k) { 511 V[k] = 0.0; 512 if (data.tidx < Q1d && data.tidy < Q1d) 513 for (CeedInt i = 0; i < Q1d; ++i) 514 V[k] += B[k + i*P1d] * U[i]; // Contract z direction 515 } 516 } 517 518 //------------------------------------------------------------------------------ 519 // 3D transpose tensor contract y 520 //------------------------------------------------------------------------------ 521 template <int NCOMP, int P1d, int Q1d> 522 inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 523 CeedScalar r_B[Q1d]; 524 for (CeedInt i = 0; i < Q1d; ++i) 525 r_B[i] = B[data.tidy + i*P1d]; 526 527 for (CeedInt k = 0; k < P1d; ++k) { 528 data.slice[data.tidx+data.tidy*T1d] = U[k]; 529 __syncthreads(); 530 V[k] = 0.0; 531 if (data.tidx < Q1d && data.tidy < P1d) 532 for (CeedInt i = 0; i < Q1d; ++i) 533 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 534 __syncthreads(); 535 } 536 } 537 538 //------------------------------------------------------------------------------ 539 // 3D transpose tensor contract add y 540 //------------------------------------------------------------------------------ 541 template <int NCOMP, int P1d, int Q1d> 542 inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 543 CeedScalar r_B[Q1d]; 544 for (CeedInt i = 0; i < Q1d; ++i) 545 r_B[i] = B[data.tidy + i*P1d]; 546 547 for (CeedInt k = 0; k < P1d; ++k) { 548 data.slice[data.tidx+data.tidy*T1d] = U[k]; 549 __syncthreads(); 550 if (data.tidx < Q1d && data.tidy < P1d) 551 for (CeedInt i = 0; i < Q1d; ++i) 552 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 553 __syncthreads(); 554 } 555 } 556 557 //------------------------------------------------------------------------------ 558 // 3D transpose tensor contract x 559 //------------------------------------------------------------------------------ 560 template <int NCOMP, int P1d, int Q1d> 561 inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 562 CeedScalar r_B[Q1d]; 563 for (CeedInt i = 0; i < Q1d; ++i) 564 r_B[i] = B[data.tidx + i*P1d]; 565 566 for (CeedInt k = 0; k < P1d; ++k) { 567 data.slice[data.tidx+data.tidy*T1d] = U[k]; 568 __syncthreads(); 569 V[k] = 0.0; 570 if (data.tidx < P1d && data.tidy < P1d) 571 for (CeedInt i = 0; i < Q1d; ++i) 572 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 573 __syncthreads(); 574 } 575 } 576 577 //------------------------------------------------------------------------------ 578 // 3D transpose tensor contract add x 579 //------------------------------------------------------------------------------ 580 template <int NCOMP, int P1d, int Q1d> 581 inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 582 CeedScalar r_B[Q1d]; 583 for (CeedInt i = 0; i < Q1d; ++i) 584 r_B[i] = B[data.tidx + i*P1d]; 585 586 for (CeedInt k = 0; k < P1d; ++k) { 587 data.slice[data.tidx+data.tidy*T1d] = U[k]; 588 __syncthreads(); 589 if (data.tidx < P1d && data.tidy < P1d) 590 for (CeedInt i = 0; i < Q1d; ++i) 591 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 592 __syncthreads(); 593 } 594 } 595 596 //------------------------------------------------------------------------------ 597 // 3D interpolate to quadrature points 598 //------------------------------------------------------------------------------ 599 template <int NCOMP, int P1d, int Q1d> 600 inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 601 CeedScalar r_t1[T1d]; 602 CeedScalar r_t2[T1d]; 603 for (CeedInt comp = 0; comp < NCOMP; comp++) { 604 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 605 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 606 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 607 } 608 } 609 610 //------------------------------------------------------------------------------ 611 // 3D interpolate transpose 612 //------------------------------------------------------------------------------ 613 template <int NCOMP, int P1d, int Q1d> 614 inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 615 CeedScalar r_t1[T1d]; 616 CeedScalar r_t2[T1d]; 617 for (CeedInt comp = 0; comp < NCOMP; comp++) { 618 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 619 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 620 ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 621 } 622 } 623 624 //------------------------------------------------------------------------------ 625 // 3D derivatives at quadrature points 626 //------------------------------------------------------------------------------ 627 template <int NCOMP, int P1d, int Q1d> 628 inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 629 CeedScalar r_t1[T1d]; 630 CeedScalar r_t2[T1d]; 631 for (CeedInt comp = 0; comp < NCOMP; comp++) { 632 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 633 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 634 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 635 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 636 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 637 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 638 ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 639 ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 640 ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 641 } 642 } 643 644 //------------------------------------------------------------------------------ 645 // 3D derivatives transpose 646 //------------------------------------------------------------------------------ 647 template <int NCOMP, int P1d, int Q1d> 648 inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 649 CeedScalar r_t1[T1d]; 650 CeedScalar r_t2[T1d]; 651 for (CeedInt comp = 0; comp < NCOMP; comp++) { 652 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 653 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 654 ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 655 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 656 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 657 ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 658 ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 659 ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 660 ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 661 } 662 } 663 664 //------------------------------------------------------------------------------ 665 // 3D collocated derivatives computation 666 //------------------------------------------------------------------------------ 667 template <int NCOMP, int Q1d> 668 inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 669 if (data.tidx < Q1d && data.tidy < Q1d) { 670 for (CeedInt comp = 0; comp < NCOMP; ++comp) { 671 data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 672 __syncthreads(); 673 // X derivative 674 r_V[comp+0*NCOMP] = 0.0; 675 for (CeedInt i = 0; i < Q1d; ++i) 676 r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 677 // Y derivative 678 r_V[comp+1*NCOMP] = 0.0; 679 for (CeedInt i = 0; i < Q1d; ++i) 680 r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 681 // Z derivative 682 r_V[comp+2*NCOMP] = 0.0; 683 for (CeedInt i = 0; i < Q1d; ++i) 684 r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 685 __syncthreads(); 686 } 687 } 688 } 689 690 //------------------------------------------------------------------------------ 691 // 3D collocated derivatives transpose 692 //------------------------------------------------------------------------------ 693 template <int NCOMP, int Q1d> 694 inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 695 if (data.tidx < Q1d && data.tidy < Q1d) { 696 for (CeedInt comp = 0; comp < NCOMP; ++comp) { 697 // X derivative 698 data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 699 __syncthreads(); 700 for (CeedInt i = 0; i < Q1d; ++i) 701 r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 702 __syncthreads(); 703 // Y derivative 704 data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP]; 705 __syncthreads(); 706 for (CeedInt i = 0; i < Q1d; ++i) 707 r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 708 __syncthreads(); 709 // Z derivative 710 for (CeedInt i = 0; i < Q1d; ++i) 711 r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 712 } 713 } 714 } 715 716 //------------------------------------------------------------------------------ 717 // 1D quadrature weights 718 //------------------------------------------------------------------------------ 719 template <int Q1d> 720 inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 721 *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 722 } 723 724 //------------------------------------------------------------------------------ 725 // 2D quadrature weights 726 //------------------------------------------------------------------------------ 727 template <int Q1d> 728 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 729 *w = (data.tidx < Q1d && data.tidy < Q1d) ? 730 qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 731 } 732 733 //------------------------------------------------------------------------------ 734 // 3D quadrature weights 735 //------------------------------------------------------------------------------ 736 template <int Q1d> 737 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 738 const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 739 const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 740 for (CeedInt z = 0; z < Q1d; ++z) 741 w[z] = quad ? pw*qweight1d[z] : 0.0; 742 } 743 744 ); 745 //------------------------------------------------------------------------------ 746 // Build singe operator kernel 747 //------------------------------------------------------------------------------ 748 extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { 749 750 using std::ostringstream; 751 using std::string; 752 int ierr; 753 bool setupdone; 754 ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr); 755 if (setupdone) return 0; 756 Ceed ceed; 757 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 758 CeedOperator_Hip_gen *data; 759 ierr = CeedOperatorGetData(op, &data); CeedChk(ierr); 760 CeedQFunction qf; 761 CeedQFunction_Hip_gen *qf_data; 762 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 763 ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr); 764 CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 765 numoutputfields, ncomp, dim = 0, lsize; 766 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 767 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 768 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 769 CeedChk(ierr); 770 CeedOperatorField *opinputfields, *opoutputfields; 771 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 772 CeedChk(ierr); 773 CeedQFunctionField *qfinputfields, *qfoutputfields; 774 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 775 CeedChk(ierr); 776 CeedEvalMode emode; 777 CeedBasis basis; 778 CeedBasis_Hip_shared *basis_data; 779 CeedElemRestriction Erestrict; 780 CeedElemRestriction_Hip *restr_data; 781 782 ostringstream code; 783 string devFunctions(deviceFunctions); 784 785 // Add atomicAdd function for old NVidia architectures 786 struct hipDeviceProp_t prop; 787 Ceed_Hip *ceed_data; 788 ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr); 789 ierr = hipGetDeviceProperties(&prop, ceed_data->deviceId); 790 if (prop.major<6){ 791 code << atomicAdd; 792 } 793 794 code << devFunctions; 795 796 string qFunction(qf_data->qFunctionSource); 797 string qFunctionName(qf_data->qFunctionName); 798 string oper; 799 oper = "CeedKernel_Hip_gen_" + qFunctionName; 800 801 code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 802 code << "\n#define CeedPragmaSIMD\n"; 803 804 // Find dim and Q1d 805 bool useCollograd = true; 806 data->maxP1d = 0; 807 for (CeedInt i = 0; i < numinputfields; i++) { 808 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 809 if (basis != CEED_BASIS_COLLOCATED) { 810 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 811 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 812 CeedChk(ierr); 813 814 // Check for collocated gradient 815 useCollograd = useCollograd && basis_data->d_collograd1d; 816 817 // Collect dim and Q1d 818 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 819 bool isTensor; 820 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 821 if (isTensor) { 822 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 823 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 824 if (P1d>data->maxP1d) data->maxP1d = P1d; 825 } else { 826 // LCOV_EXCL_START 827 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 828 // LCOV_EXCL_STOP 829 } 830 } 831 } 832 // Check output bases for Q1d, dim as well 833 // The only imput basis might be CEED_BASIS_COLLOCATED 834 for (CeedInt i = 0; i < numoutputfields; i++) { 835 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 836 837 if (basis != CEED_BASIS_COLLOCATED) { 838 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 839 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 840 CeedChk(ierr); 841 842 // Collect dim and Q1d 843 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 844 bool isTensor; 845 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 846 if (isTensor) { 847 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 848 } else { 849 // LCOV_EXCL_START 850 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 851 // LCOV_EXCL_STOP 852 } 853 854 // Check for collocated gradient 855 useCollograd = useCollograd && basis_data->d_collograd1d; 856 } 857 } 858 data->dim = dim; 859 data->Q1d = Q1d; 860 861 // Define CEED_Q_VLA 862 if (dim != 3 || useCollograd) { 863 code << "\n#define CEED_Q_VLA 1\n\n"; 864 } else { 865 code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 866 } 867 868 code << qFunction; 869 870 // Setup 871 code << "\n// -----------------------------------------------------------------------------\n"; 872 code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n"; 873 for (CeedInt i = 0; i < numinputfields; i++) { 874 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 875 CeedChk(ierr); 876 if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 877 code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 878 } 879 } 880 881 for (CeedInt i = 0; i < numoutputfields; i++) { 882 code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 883 } 884 885 code << " const CeedInt Dim = "<<dim<<";\n"; 886 code << " const CeedInt Q1d = "<<Q1d<<";\n"; 887 888 code << " HIP_DYNAMIC_SHARED( CeedScalar, slice)\n"; 889 code << " BackendData data;\n"; 890 code << " data.tidx = threadIdx.x;\n"; 891 code << " data.tidy = threadIdx.y;\n"; 892 code << " data.tidz = threadIdx.z;\n"; 893 code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 894 code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 895 896 code << "\n // -- Input field constants and basis data --\n"; 897 //Initialize constants, and matrices B and G 898 for (CeedInt i = 0; i < numinputfields; i++) { 899 code << " // ---- Input field "<<i<<" ----\n"; 900 // Get elemsize, emode, ncomp 901 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 902 CeedChk(ierr); 903 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 904 CeedChk(ierr); 905 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 906 CeedChk(ierr); 907 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 908 CeedChk(ierr); 909 910 // Set field constants 911 if (emode != CEED_EVAL_WEIGHT) { 912 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 913 if (basis != CEED_BASIS_COLLOCATED) { 914 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 915 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 916 } else { 917 code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 918 } 919 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 920 } 921 922 // Load basis data 923 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 924 switch (emode) { 925 case CEED_EVAL_NONE: 926 break; 927 case CEED_EVAL_INTERP: 928 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 929 data->B.in[i] = basis_data->d_interp1d; 930 code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 931 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 932 break; 933 case CEED_EVAL_GRAD: 934 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 935 data->B.in[i] = basis_data->d_interp1d; 936 code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 937 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 938 if (useCollograd) { 939 data->G.in[i] = basis_data->d_collograd1d; 940 code << " __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 941 code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 942 } else { 943 data->G.in[i] = basis_data->d_grad1d; 944 code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 945 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 946 } 947 break; 948 case CEED_EVAL_WEIGHT: 949 break; // No action 950 case CEED_EVAL_DIV: 951 break; // TODO: Not implemented 952 case CEED_EVAL_CURL: 953 break; // TODO: Not implemented 954 } 955 } 956 957 code << "\n // -- Output field constants and basis data --\n"; 958 for (CeedInt i = 0; i < numoutputfields; i++) { 959 code << " // ---- Output field "<<i<<" ----\n"; 960 // Get elemsize, emode, ncomp 961 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 962 CeedChk(ierr); 963 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 964 CeedChk(ierr); 965 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 966 CeedChk(ierr); 967 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 968 CeedChk(ierr); 969 970 // Set field constants 971 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 972 if (basis != CEED_BASIS_COLLOCATED) { 973 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 974 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 975 } else { 976 code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 977 } 978 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 979 980 // Load basis data 981 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 982 switch (emode) { 983 case CEED_EVAL_NONE: 984 break; // No action 985 case CEED_EVAL_INTERP: 986 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 987 data->B.out[i] = basis_data->d_interp1d; 988 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 989 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 990 break; 991 case CEED_EVAL_GRAD: 992 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 993 data->B.out[i] = basis_data->d_interp1d; 994 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 995 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 996 if (useCollograd) { 997 data->G.out[i] = basis_data->d_collograd1d; 998 code << " __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 999 code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1000 } else { 1001 data->G.out[i] = basis_data->d_grad1d; 1002 code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1003 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1004 } 1005 break; 1006 // LCOV_EXCL_START 1007 case CEED_EVAL_WEIGHT: { 1008 Ceed ceed; 1009 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1010 return CeedError(ceed, 1, 1011 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1012 break; // Should not occur 1013 } 1014 case CEED_EVAL_DIV: 1015 break; // TODO: Not implemented 1016 case CEED_EVAL_CURL: 1017 break; // TODO: Not implemented 1018 // LCOV_EXCL_STOP 1019 } 1020 } 1021 code << "\n // -- Element loop --\n"; 1022 code << " __syncthreads();\n"; 1023 code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1024 // Input basis apply if needed 1025 // Generate the correct eval mode code for each input 1026 code << " // -- Input field restrictions and basis actions --\n"; 1027 for (CeedInt i = 0; i < numinputfields; i++) { 1028 code << " // ---- Input field "<<i<<" ----\n"; 1029 // Get elemsize, emode, ncomp 1030 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1031 CeedChk(ierr); 1032 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1033 CeedChk(ierr); 1034 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1035 CeedChk(ierr); 1036 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1037 CeedChk(ierr); 1038 1039 // Restriction 1040 if (emode != CEED_EVAL_WEIGHT && 1041 !((emode == CEED_EVAL_NONE) && useCollograd)) { 1042 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1043 1044 bool isStrided; 1045 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1046 if (!isStrided) { 1047 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1048 CeedChk(ierr); 1049 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1050 CeedInt compstride; 1051 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1052 code << " // CompStride: "<<compstride<<"\n"; 1053 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1054 data->indices.in[i] = restr_data->d_ind; 1055 code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1056 } else { 1057 bool backendstrides; 1058 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1059 CeedChk(ierr); 1060 CeedInt nelem; 1061 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1062 CeedChk(ierr); 1063 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1064 if (!backendstrides) { 1065 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1066 CeedChk(ierr); 1067 } 1068 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1069 code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1070 } 1071 } 1072 1073 // Basis action 1074 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1075 switch (emode) { 1076 case CEED_EVAL_NONE: 1077 if (!useCollograd) { 1078 code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1079 } 1080 break; 1081 case CEED_EVAL_INTERP: 1082 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1083 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1084 break; 1085 case CEED_EVAL_GRAD: 1086 if (useCollograd) { 1087 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1088 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1089 } else { 1090 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1091 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1092 } 1093 break; 1094 case CEED_EVAL_WEIGHT: 1095 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1096 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1097 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 1098 data->W = basis_data->d_qweight1d; 1099 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1100 break; // No action 1101 case CEED_EVAL_DIV: 1102 break; // TODO: Not implemented 1103 case CEED_EVAL_CURL: 1104 break; // TODO: Not implemented 1105 } 1106 } 1107 1108 // Q function 1109 code << "\n // -- Output field setup --\n"; 1110 for (CeedInt i = 0; i < numoutputfields; i++) { 1111 code << "\n // ---- Output field "<<i<<" ----\n"; 1112 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1113 CeedChk(ierr); 1114 if (emode==CEED_EVAL_GRAD) 1115 { 1116 if (useCollograd) { 1117 //Accumulator for gradient slices 1118 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1119 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1120 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1121 code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1122 code << " }\n"; 1123 code << " }\n"; 1124 } else { 1125 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1126 } 1127 } 1128 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1129 { 1130 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1131 } 1132 } 1133 // We treat quadrature points per slice in 3d to save registers 1134 if (useCollograd) { 1135 code << "\n // Note: Collocated Gradient\n"; 1136 code << "#pragma unroll\n"; 1137 code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1138 code << " // -- Input fields --\n"; 1139 for (CeedInt i = 0; i < numinputfields; i++) { 1140 code << " // ---- Input field "<<i<<" ----\n"; 1141 // Get elemsize, emode, ncomp 1142 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1143 CeedChk(ierr); 1144 // Basis action 1145 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1146 switch (emode) { 1147 case CEED_EVAL_NONE: 1148 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1149 1150 bool isStrided; 1151 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr); 1152 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr); 1153 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1154 if (!isStrided) { 1155 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1156 CeedChk(ierr); 1157 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1158 CeedInt compstride; 1159 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1160 code << " // CompStride: "<<compstride<<"\n"; 1161 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1162 data->indices.in[i] = restr_data->d_ind; 1163 code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1164 } else { 1165 bool backendstrides; 1166 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1167 CeedChk(ierr); 1168 CeedInt nelem; 1169 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1170 CeedChk(ierr); 1171 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1172 if (!backendstrides) { 1173 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1174 CeedChk(ierr); 1175 } 1176 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1177 code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1178 } 1179 break; 1180 case CEED_EVAL_INTERP: 1181 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1182 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1183 code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1184 code << " }\n"; 1185 break; 1186 case CEED_EVAL_GRAD: 1187 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1188 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1189 break; 1190 case CEED_EVAL_WEIGHT: 1191 code << " CeedScalar r_q"<<i<<"[1];\n"; 1192 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1193 break; // No action 1194 case CEED_EVAL_DIV: 1195 break; // TODO: Not implemented 1196 case CEED_EVAL_CURL: 1197 break; // TODO: Not implemented 1198 } 1199 } 1200 code << "\n // -- Output fields --\n"; 1201 for (CeedInt i = 0; i < numoutputfields; i++) { 1202 code << " // ---- Output field "<<i<<" ----\n"; 1203 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1204 CeedChk(ierr); 1205 // Basis action 1206 switch (emode) { 1207 case CEED_EVAL_NONE: 1208 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1209 break; // No action 1210 case CEED_EVAL_INTERP: 1211 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1212 break; 1213 case CEED_EVAL_GRAD: 1214 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1215 break; 1216 case CEED_EVAL_WEIGHT: 1217 break; // Should not occur 1218 case CEED_EVAL_DIV: 1219 break; // TODO: Not implemented 1220 case CEED_EVAL_CURL: 1221 break; // TODO: Not implemented 1222 } 1223 } 1224 } else { 1225 code << "\n // Note: No Collocated Gradient\n"; 1226 code << " // -- Input fields --\n"; 1227 for (CeedInt i = 0; i < numinputfields; i++) { 1228 code << " // ---- Input field "<<i<<" ----\n"; 1229 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1230 } 1231 code << " // -- Output fields --\n"; 1232 for (CeedInt i = 0; i < numoutputfields; i++) { 1233 code << " // ---- Output field "<<i<<" ----\n"; 1234 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1235 } 1236 } 1237 code << "\n // -- QFunction Inputs and outputs --\n"; 1238 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1239 for (CeedInt i = 0; i < numinputfields; i++) { 1240 code << " // ---- Input field "<<i<<" ----\n"; 1241 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1242 } 1243 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1244 for (CeedInt i = 0; i < numoutputfields; i++) { 1245 code << " // ---- Output field "<<i<<" ----\n"; 1246 code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1247 } 1248 code << "\n // -- Apply QFunction --\n"; 1249 code << " "<<qFunctionName<<"(ctx, "; 1250 if (dim != 3 || useCollograd) { 1251 code << "1"; 1252 } else { 1253 code << "Q1d"; 1254 } 1255 code << ", in, out);\n"; 1256 if (useCollograd) { 1257 code << "\n // Note: Collocated Gradient\n"; 1258 code << " // -- Output fields --\n"; 1259 for (CeedInt i = 0; i < numoutputfields; i++) { 1260 code << " // ---- Output field "<<i<<" ----\n"; 1261 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1262 CeedChk(ierr); 1263 // Basis action 1264 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1265 switch (emode) { 1266 case CEED_EVAL_NONE: 1267 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1268 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1269 code << " }\n"; 1270 break; // No action 1271 case CEED_EVAL_INTERP: 1272 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1273 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1274 code << " }\n"; 1275 break; 1276 case CEED_EVAL_GRAD: 1277 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1278 break; 1279 case CEED_EVAL_WEIGHT: 1280 break; // Should not occur 1281 case CEED_EVAL_DIV: 1282 break; // TODO: Not implemented 1283 case CEED_EVAL_CURL: 1284 break; // TODO: Not implemented 1285 } 1286 } 1287 code << " }\n"; 1288 } 1289 1290 // Output basis apply if needed 1291 // Generate the correct eval mode code for each output 1292 code << "\n // -- Output field basis action and restrictions --\n"; 1293 for (CeedInt i = 0; i < numoutputfields; i++) { 1294 code << " // ---- Output field "<<i<<" ----\n"; 1295 // Get elemsize, emode, ncomp 1296 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1297 CeedChk(ierr); 1298 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1299 CeedChk(ierr); 1300 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1301 CeedChk(ierr); 1302 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1303 CeedChk(ierr); 1304 // Basis action 1305 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1306 switch (emode) { 1307 case CEED_EVAL_NONE: 1308 code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1309 break; // No action 1310 case CEED_EVAL_INTERP: 1311 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1312 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1313 break; 1314 case CEED_EVAL_GRAD: 1315 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1316 if (useCollograd) { 1317 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1318 } else { 1319 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1320 } 1321 break; 1322 // LCOV_EXCL_START 1323 case CEED_EVAL_WEIGHT: { 1324 Ceed ceed; 1325 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1326 return CeedError(ceed, 1, 1327 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1328 break; // Should not occur 1329 } 1330 case CEED_EVAL_DIV: 1331 break; // TODO: Not implemented 1332 case CEED_EVAL_CURL: 1333 break; // TODO: Not implemented 1334 // LCOV_EXCL_STOP 1335 } 1336 // Restriction 1337 bool isStrided; 1338 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1339 if (!isStrided) { 1340 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1341 CeedChk(ierr); 1342 code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 1343 CeedInt compstride; 1344 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1345 code << " // CompStride: "<<compstride<<"\n"; 1346 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1347 data->indices.out[i] = restr_data->d_ind; 1348 code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1349 } else { 1350 bool backendstrides; 1351 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1352 CeedChk(ierr); 1353 CeedInt nelem; 1354 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1355 CeedChk(ierr); 1356 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1357 if (!backendstrides) { 1358 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1359 CeedChk(ierr); 1360 } 1361 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1362 code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1363 } 1364 } 1365 1366 code << " }\n"; 1367 code << "}\n"; 1368 code << "// -----------------------------------------------------------------------------\n\n"; 1369 1370 // View kernel for debugging 1371 CeedDebug(code.str().c_str()); 1372 1373 ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 1, 1374 "T1d", CeedIntMax(Q1d, data->maxP1d)); 1375 CeedChk(ierr); 1376 ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op); 1377 CeedChk(ierr); 1378 1379 ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1380 return 0; 1381 } 1382 //------------------------------------------------------------------------------ 1383