1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
// Debug-output color for this backend (used by the CeedDebug* machinery).
#define CEED_DEBUG_COLOR 12

#include "ceed-hip-gen.h"
#include <iostream>
#include <sstream>
#include "../hip-shared/ceed-hip-shared.h"
#include "../hip/ceed-hip-compile.h"

// Device-side helper functions, stringified via QUOTE (NOTE(review): QUOTE is
// presumably a stringification macro from ceed-hip-compile.h -- confirm) and
// compiled at runtime by the code generator. Comments inside the quoted region
// are carried along in the generated source.
static const char *deviceFunctions = QUOTE(

//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
// Pointers to the input/output field arrays of an operator (max 16 each).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;

// Per-thread kernel state: cached thread indices plus a pointer to this
// thread block's shared-memory scratch slice (set up by the generated kernel;
// its stride is the generator-injected constant T1d).
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Block-stride copy of a P x Q basis matrix from global (d_B) to B (presumably
// shared memory -- allocation is owned by the generated kernel). All threads
// in the block cooperate; caller must synchronize before using B.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather all NCOMP components of one node into registers r_u, using the
// element restriction indices. nnodes is unused here (kept for a uniform
// signature across read/write variants).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------ 67 // L-vector -> E-vector, strided 68 //------------------------------------------------------------------------------ 69 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 70 inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 71 if (data.tidx < P1d) { 72 const CeedInt node = data.tidx; 73 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 74 for (CeedInt comp = 0; comp < NCOMP; ++comp) 75 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 76 } 77 } 78 79 //------------------------------------------------------------------------------ 80 // E-vector -> L-vector, offsets provided 81 //------------------------------------------------------------------------------ 82 template <int NCOMP, int COMPSTRIDE, int P1d> 83 inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 84 if (data.tidx < P1d) { 85 const CeedInt node = data.tidx; 86 const CeedInt ind = indices[node + elem * P1d]; 87 for (CeedInt comp = 0; comp < NCOMP; ++comp) 88 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 89 } 90 } 91 92 //------------------------------------------------------------------------------ 93 // E-vector -> L-vector, strided 94 //------------------------------------------------------------------------------ 95 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 96 inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 97 if (data.tidx < P1d) { 98 const CeedInt node = data.tidx; 99 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 100 for (CeedInt comp = 0; comp < NCOMP; ++comp) 101 d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 102 } 103 } 104 105 
//------------------------------------------------------------------------------ 106 // 1D tensor contraction x 107 //------------------------------------------------------------------------------ 108 template <int NCOMP, int P1d, int Q1d> 109 inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 110 data.slice[data.tidx] = *U; 111 __syncthreads(); 112 *V = 0.0; 113 if (data.tidx < Q1d) 114 for (CeedInt i = 0; i < P1d; ++i) 115 *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 116 __syncthreads(); 117 } 118 119 //------------------------------------------------------------------------------ 120 // 1D transpose tensor contraction x 121 //------------------------------------------------------------------------------ 122 template <int NCOMP, int P1d, int Q1d> 123 inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 124 data.slice[data.tidx] = *U; 125 __syncthreads(); 126 *V = 0.0; 127 if (data.tidx < P1d) 128 for (CeedInt i = 0; i < Q1d; ++i) 129 *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 130 __syncthreads(); 131 } 132 133 //------------------------------------------------------------------------------ 134 // 1D interpolate to quadrature points 135 //------------------------------------------------------------------------------ 136 template <int NCOMP, int P1d, int Q1d> 137 inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 138 for (CeedInt comp = 0; comp < NCOMP; comp++) 139 ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 140 } 141 142 //------------------------------------------------------------------------------ 143 // 1D interpolate transpose 144 //------------------------------------------------------------------------------ 145 template <int NCOMP, int P1d, int Q1d> 146 inline __device__ 
void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 147 for (CeedInt comp=0; comp<NCOMP; comp++) 148 ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 149 } 150 151 //------------------------------------------------------------------------------ 152 // 1D derivatives at quadrature points 153 //------------------------------------------------------------------------------ 154 template <int NCOMP, int P1d, int Q1d> 155 inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 156 for (CeedInt comp = 0; comp < NCOMP; comp++) 157 ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 158 } 159 160 //------------------------------------------------------------------------------ 161 // 1D derivatives transpose 162 //------------------------------------------------------------------------------ 163 template <int NCOMP, int P1d, int Q1d> 164 inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 165 for (CeedInt comp = 0; comp < NCOMP; comp++) 166 ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 167 } 168 169 //------------------------------------------------------------------------------ 170 // 2D 171 //------------------------------------------------------------------------------ 172 173 //------------------------------------------------------------------------------ 174 // L-vector -> E-vector, offsets provided 175 //------------------------------------------------------------------------------ 176 template <int NCOMP, int COMPSTRIDE, int P1d> 177 inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 178 
if (data.tidx < P1d && data.tidy < P1d) { 179 const CeedInt node = data.tidx + data.tidy*P1d; 180 const CeedInt ind = indices[node + elem * P1d*P1d]; 181 for (CeedInt comp = 0; comp < NCOMP; ++comp) 182 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 183 } 184 } 185 186 //------------------------------------------------------------------------------ 187 // L-vector -> E-vector, strided 188 //------------------------------------------------------------------------------ 189 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 190 inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 191 if (data.tidx < P1d && data.tidy < P1d) { 192 const CeedInt node = data.tidx + data.tidy*P1d; 193 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 194 for (CeedInt comp = 0; comp < NCOMP; ++comp) 195 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 196 } 197 } 198 199 //------------------------------------------------------------------------------ 200 // E-vector -> L-vector, offsets provided 201 //------------------------------------------------------------------------------ 202 template <int NCOMP, int COMPSTRIDE, int P1d> 203 inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 204 if (data.tidx < P1d && data.tidy < P1d) { 205 const CeedInt node = data.tidx + data.tidy*P1d; 206 const CeedInt ind = indices[node + elem * P1d*P1d]; 207 for (CeedInt comp = 0; comp < NCOMP; ++comp) 208 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 209 } 210 } 211 212 //------------------------------------------------------------------------------ 213 // E-vector -> L-vector, strided 214 //------------------------------------------------------------------------------ 215 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 216 inline __device__ void 
writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 217 if (data.tidx < P1d && data.tidy < P1d) { 218 const CeedInt node = data.tidx + data.tidy*P1d; 219 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 220 for (CeedInt comp = 0; comp < NCOMP; ++comp) 221 d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 222 } 223 } 224 225 //------------------------------------------------------------------------------ 226 // 2D tensor contraction x 227 //------------------------------------------------------------------------------ 228 template <int NCOMP, int P1d, int Q1d> 229 inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 230 data.slice[data.tidx+data.tidy*T1d] = *U; 231 __syncthreads(); 232 *V = 0.0; 233 if (data.tidx < Q1d && data.tidy < P1d) 234 for (CeedInt i = 0; i < P1d; ++i) 235 *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 236 __syncthreads(); 237 } 238 239 //------------------------------------------------------------------------------ 240 // 2D tensor contract y 241 //------------------------------------------------------------------------------ 242 template <int NCOMP, int P1d, int Q1d> 243 inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 244 data.slice[data.tidx+data.tidy*T1d] = *U; 245 __syncthreads(); 246 *V = 0.0; 247 if (data.tidx < Q1d && data.tidy < Q1d) 248 for (CeedInt i = 0; i < P1d; ++i) 249 *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 250 __syncthreads(); 251 } 252 253 //------------------------------------------------------------------------------ 254 // 2D transpose tensor contract y 255 //------------------------------------------------------------------------------ 256 template <int NCOMP, int P1d, int Q1d> 257 inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar 
*U, const CeedScalar *B, CeedScalar *V) { 258 data.slice[data.tidx+data.tidy*T1d] = *U; 259 __syncthreads(); 260 *V = 0.0; 261 if (data.tidx < Q1d && data.tidy < P1d) 262 for (CeedInt i = 0; i < Q1d; ++i) 263 *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 264 __syncthreads(); 265 } 266 267 //------------------------------------------------------------------------------ 268 // 2D transpose tensor contract x 269 //------------------------------------------------------------------------------ 270 template <int NCOMP, int P1d, int Q1d> 271 inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 272 data.slice[data.tidx+data.tidy*T1d] = *U; 273 __syncthreads(); 274 *V = 0.0; 275 if (data.tidx < P1d && data.tidy < P1d) 276 for (CeedInt i = 0; i < Q1d; ++i) 277 *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 278 __syncthreads(); 279 } 280 281 //------------------------------------------------------------------------------ 282 // 2D transpose tensor contract and add x 283 //------------------------------------------------------------------------------ 284 template <int NCOMP, int P1d, int Q1d> 285 inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 286 data.slice[data.tidx+data.tidy*T1d] = *U; 287 __syncthreads(); 288 if (data.tidx < P1d && data.tidy < P1d) 289 for (CeedInt i = 0; i < Q1d; ++i) 290 *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 291 __syncthreads(); 292 } 293 294 //------------------------------------------------------------------------------ 295 // 2D interpolate to quadrature points 296 //------------------------------------------------------------------------------ 297 template <int NCOMP, int P1d, int Q1d> 298 inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const 
CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 299 CeedScalar r_t[1]; 300 for (CeedInt comp = 0; comp < NCOMP; comp++) { 301 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 302 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 303 } 304 } 305 306 //------------------------------------------------------------------------------ 307 // 2D interpolate transpose 308 //------------------------------------------------------------------------------ 309 template <int NCOMP, int P1d, int Q1d> 310 inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 311 CeedScalar r_t[1]; 312 for (CeedInt comp = 0; comp < NCOMP; comp++) { 313 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 314 ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 315 } 316 } 317 318 //------------------------------------------------------------------------------ 319 // 2D derivatives at quadrature points 320 //------------------------------------------------------------------------------ 321 template <int NCOMP, int P1d, int Q1d> 322 inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 323 CeedScalar r_t[1]; 324 for (CeedInt comp = 0; comp < NCOMP; comp++) { 325 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 326 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 327 ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 328 ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 329 } 330 } 331 332 //------------------------------------------------------------------------------ 333 // 2D derivatives transpose 334 //------------------------------------------------------------------------------ 335 template <int NCOMP, int P1d, int Q1d> 336 inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar 
*__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 337 CeedScalar r_t[1]; 338 for (CeedInt comp = 0; comp < NCOMP; comp++) { 339 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 340 ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 341 ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 342 ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 343 } 344 } 345 346 //------------------------------------------------------------------------------ 347 // 3D 348 //------------------------------------------------------------------------------ 349 350 //------------------------------------------------------------------------------ 351 // L-vector -> E-vector, offsets provided 352 //------------------------------------------------------------------------------ 353 template <int NCOMP, int COMPSTRIDE, int P1d> 354 inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 355 if (data.tidx < P1d && data.tidy < P1d) 356 for (CeedInt z = 0; z < P1d; ++z) { 357 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 358 const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 359 for (CeedInt comp = 0; comp < NCOMP; ++comp) 360 r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 361 } 362 } 363 364 //------------------------------------------------------------------------------ 365 // L-vector -> E-vector, strided 366 //------------------------------------------------------------------------------ 367 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 368 inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 369 if (data.tidx < P1d && data.tidy < P1d) 370 for (CeedInt z = 0; z < P1d; ++z) { 371 const CeedInt node = data.tidx + data.tidy*P1d + 
z*P1d*P1d; 372 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 373 for (CeedInt comp = 0; comp < NCOMP; ++comp) 374 r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 375 } 376 } 377 378 //------------------------------------------------------------------------------ 379 // E-vector -> Q-vector, offests provided 380 //------------------------------------------------------------------------------ 381 template <int NCOMP, int COMPSTRIDE, int Q1d> 382 inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 383 if (data.tidx < Q1d && data.tidy < Q1d) { 384 const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 385 const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 386 for (CeedInt comp = 0; comp < NCOMP; ++comp) 387 r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 388 } 389 } 390 391 //------------------------------------------------------------------------------ 392 // E-vector -> Q-vector, strided 393 //------------------------------------------------------------------------------ 394 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 395 inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 396 if (data.tidx < Q1d && data.tidy < Q1d) { 397 const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 398 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 399 for (CeedInt comp = 0; comp < NCOMP; ++comp) 400 r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 401 } 402 } 403 404 //------------------------------------------------------------------------------ 405 // E-vector -> L-vector, offsets provided 406 //------------------------------------------------------------------------------ 407 template <int NCOMP, int COMPSTRIDE, int P1d> 408 inline __device__ void writeDofsOffset3d(BackendData& 
data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 409 if (data.tidx < P1d && data.tidy < P1d) 410 for (CeedInt z = 0; z < P1d; ++z) { 411 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 412 const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 413 for (CeedInt comp = 0; comp < NCOMP; ++comp) 414 atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 415 } 416 } 417 418 //------------------------------------------------------------------------------ 419 // E-vector -> L-vector, strided 420 //------------------------------------------------------------------------------ 421 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 422 inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 423 if (data.tidx < P1d && data.tidy < P1d) 424 for (CeedInt z = 0; z < P1d; ++z) { 425 const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 426 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 427 for (CeedInt comp = 0; comp < NCOMP; ++comp) 428 d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 429 } 430 } 431 432 //------------------------------------------------------------------------------ 433 // 3D tensor contract x 434 //------------------------------------------------------------------------------ 435 template <int NCOMP, int P1d, int Q1d> 436 inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 437 CeedScalar r_B[P1d]; 438 for (CeedInt i = 0; i < P1d; ++i) 439 r_B[i] = B[i + data.tidx*P1d]; 440 441 for (CeedInt k = 0; k < P1d; ++k) { 442 data.slice[data.tidx+data.tidy*T1d] = U[k]; 443 __syncthreads(); 444 V[k] = 0.0; 445 if (data.tidx < Q1d && data.tidy < P1d) 446 for (CeedInt i = 0; i < P1d; ++i) 447 V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 448 __syncthreads(); 449 } 450 } 451 452 
//------------------------------------------------------------------------------ 453 // 3D tensor contract y 454 //------------------------------------------------------------------------------ 455 template <int NCOMP, int P1d, int Q1d> 456 inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 457 CeedScalar r_B[P1d]; 458 for (CeedInt i = 0; i < P1d; ++i) 459 r_B[i] = B[i + data.tidy*P1d]; 460 461 for (CeedInt k = 0; k < P1d; ++k) { 462 data.slice[data.tidx+data.tidy*T1d] = U[k]; 463 __syncthreads(); 464 V[k] = 0.0; 465 if (data.tidx < Q1d && data.tidy < Q1d) 466 for (CeedInt i = 0; i < P1d; ++i) 467 V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 468 __syncthreads(); 469 } 470 } 471 472 //------------------------------------------------------------------------------ 473 // 3D tensor contract z 474 //------------------------------------------------------------------------------ 475 template <int NCOMP, int P1d, int Q1d> 476 inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 477 for (CeedInt k = 0; k < Q1d; ++k) { 478 V[k] = 0.0; 479 if (data.tidx < Q1d && data.tidy < Q1d) 480 for (CeedInt i = 0; i < P1d; ++i) 481 V[k] += B[i + k*P1d] * U[i]; // Contract z direction 482 } 483 } 484 485 //------------------------------------------------------------------------------ 486 // 3D transpose tensor contract z 487 //------------------------------------------------------------------------------ 488 template <int NCOMP, int P1d, int Q1d> 489 inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 490 for (CeedInt k = 0; k < P1d; ++k) { 491 V[k] = 0.0; 492 if (data.tidx < Q1d && data.tidy < Q1d) 493 for (CeedInt i = 0; i < Q1d; ++i) 494 V[k] += B[k + i*P1d] * U[i]; // Contract z direction 495 } 496 } 497 498 
//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// B^T applied along y, one z-slab at a time through the shared slice; this
// thread's row of B^T is cached in registers up front.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // slice fully written before reads
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // slice fully read before next slab
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3d but accumulates into V (no zeroing); used to
// sum contributions from multiple derivative directions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Accumulating variant of ContractTransposeX3d (no zeroing of V).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Tensor-product interpolation: X, then Y, then Z contraction with c_B.
// r_U layout: r_U[z + comp*P1d]; r_V layout: r_V[z + comp*Q1d].
// r_t1/r_t2 are sized T1d -- presumably T1d >= max(P1d, Q1d), set by the
// kernel generator (TODO confirm).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Reverse order of interp3d: Z^T, Y^T, X^T.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// One G-contraction per direction, B elsewhere.
// r_V layout: r_V[z + comp*Q1d + dim*NCOMP*Q1d], dim 0/1/2 = d/dx, d/dy, d/dz.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // x-derivative contribution: the plain ContractTransposeX3d initializes
    // r_V; the y and z contributions below use the Add variant to accumulate.
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    // y-derivative contribution
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    // z-derivative contribution
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Derivatives at quadrature points using the collocated gradient matrix c_G
// (Q1d x Q1d), evaluated for a single z-slice q of the register array r_U.
// NOTE(review): the __syncthreads() calls below sit inside the tidx/tidy
// guard; this is only well-defined if every thread of the block takes the
// same branch (i.e. the block is exactly Q1d x Q1d, T1d == Q1d). Confirm the
// launch configuration guarantees that whenever this path is generated.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // Stage this component's slice-q values in shared memory.
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
      __syncthreads();
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx +
i*T1d]; // Contract y direction (Y derivative)
      // Z derivative: contract the z column held in registers (no shared
      // memory needed for this direction).
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
      __syncthreads();
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose of gradCollo3d: accumulate the three directional derivative
// contributions for slice q back into r_V (r_V must be pre-initialized by the
// caller since every update here is "+=").
// NOTE(review): as in gradCollo3d, the __syncthreads() calls sit inside the
// tidx/tidy guard; confirm the block is exactly Q1d x Q1d (T1d == Q1d) so no
// thread skips the barrier.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // X derivative
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      __syncthreads();
      // Y derivative
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      __syncthreads();
      // Z derivative
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
    }
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
// Each thread along x picks up its 1D quadrature weight (zero outside Q1d).
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = (data.tidx < Q1d) ?
qweight1d[data.tidx] : 0.0;
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// Tensor-product 2D weight: product of the x and y 1D weights (zero outside
// the Q1d x Q1d quadrature tile).
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = (data.tidx < Q1d && data.tidy < Q1d) ?
       qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
// Tensor-product 3D weights: one weight per z quadrature point stored in the
// thread-local array w; the xy partial weight pw is hoisted out of the loop.
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
  const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
  for (CeedInt z = 0; z < Q1d; ++z)
    w[z] = quad ? pw*qweight1d[z] : 0.0;
}

);
//------------------------------------------------------------------------------
// Build single operator kernel
//------------------------------------------------------------------------------
// Generate, JIT-compile, and cache the fused HIP kernel for this operator.
// The kernel source is assembled into `code` from the stringified device
// functions above plus the user's QFunction source, then handed to
// CeedCompileHip. Runs once per operator; subsequent calls return early.
CEED_INTERN int CeedHipGenOperatorBuild(CeedOperator op) {

  using std::ostringstream;
  using std::string;
  int ierr;
  bool setupdone;
  ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
  if (setupdone) return 0; // Kernel already built and cached
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
  CeedOperator_Hip_gen *data;
  ierr = CeedOperatorGetData(op, &data); CeedChk(ierr);
  CeedQFunction qf;
  CeedQFunction_Hip_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
  ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr);
  // Operator/field dimensions gathered below; Q1d and dim are refined while
  // scanning the input and output bases.
  CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
          numoutputfields, ncomp, dim = 0, lsize;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q);
CeedChk(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
  ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
  CeedChk(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
  CeedChk(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
  CeedChk(ierr);
  CeedEvalMode emode;
  CeedBasis basis;
  CeedBasis_Hip_shared *basis_data;
  CeedElemRestriction Erestrict;
  CeedElemRestriction_Hip *restr_data;

  // Start the generated source with the shared device-function templates.
  ostringstream code;
  string devFunctions(deviceFunctions);
  code << devFunctions;

  string qFunction(qf_data->qFunctionSource);
  string qFunctionName(qf_data->qFunctionName);
  string oper;
  oper = "CeedKernel_Hip_gen_" + qFunctionName;

  // Redefine QFunction macros for the JIT context.
  code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
  code << "\n#define CeedPragmaSIMD\n";

  // Find dim and Q1d
  // The collocated-gradient path is used only if every active basis provides
  // a collocated gradient matrix.
  bool useCollograd = true;
  data->maxP1d = 0;
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
      ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
      CeedChk(ierr);

      // Check for collocated gradient
      useCollograd = useCollograd && basis_data->d_collograd1d;

      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
        ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
        // Track the largest 1D node count; it sizes the thread block (T1d).
        if (P1d>data->maxP1d) data->maxP1d = P1d;
      } else {
        // LCOV_EXCL_START
        return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
        // LCOV_EXCL_STOP
      }
    }
  }
  // Check output bases for Q1d, dim as well
  // The only input basis might be CEED_BASIS_COLLOCATED
  for (CeedInt i = 0; i < numoutputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);

    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChk(ierr);

      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
      } else {
        // LCOV_EXCL_START
        return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
        // LCOV_EXCL_STOP
      }

      // Check for collocated gradient
      useCollograd = useCollograd && basis_data->d_collograd1d;
    }
  }
  data->dim = dim;
  data->Q1d = Q1d;

  // Define CEED_Q_VLA
  // In the 3D collocated path the QFunction is applied one quadrature slice
  // at a time (Q = 1); otherwise it sees all Q1d points at once.
  if (dim != 3 || useCollograd) {
    code << "\n#define CEED_Q_VLA 1\n\n";
  } else {
    code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
  }

  // Inline the user's QFunction source.
  code << qFunction;

  // Setup
  // Emit the kernel signature and bind the L-vector pointers.
  code << "\n// -----------------------------------------------------------------------------\n";
  code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChk(ierr);
    if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
      code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
    }
  }

  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
  }

  code << " const CeedInt Dim = "<<dim<<";\n";
  code << " const CeedInt Q1d = "<<Q1d<<";\n";

  // Per-thread indices and the dynamic shared-memory slice for contractions.
  code << " HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
  code << " BackendData data;\n";
  code << " data.tidx = threadIdx.x;\n";
  code << " data.tidy = threadIdx.y;\n";
  code << " data.tidz = threadIdx.z;\n";
  code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
  code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";

  code << "\n // -- Input field constants and basis data --\n";
  // Initialize constants, and matrices B and G
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChk(ierr);

    // Set field constants
    if (emode != CEED_EVAL_WEIGHT) {
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
      if (basis != CEED_BASIS_COLLOCATED) {
        ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
        code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
      } else {
        // Collocated basis: nodes coincide with quadrature points.
        code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
      }
      code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
    }

    // Load basis data
    // Record the device matrix pointers in `data` (passed at launch) and emit
    // shared-memory staging for them.
    // NOTE(review): the generated shared arrays are declared `double` while
    // loadMatrix copies CeedScalar; this matches only when CeedScalar is
    // double — confirm.
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
      data->B.in[i] = basis_data->d_interp1d;
      code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
      data->B.in[i] = basis_data->d_interp1d;
      code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      if (useCollograd) {
        // Collocated gradient matrix is Q1d x Q1d.
        data->G.in[i] = basis_data->d_collograd1d;
        code << " __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
        code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      } else {
        data->G.in[i] = basis_data->d_grad1d;
        code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
        code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  code << "\n // -- Output field constants and basis data --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChk(ierr);

    // Set field constants
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
      code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
    } else {
      code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
    }
    code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";

    // Load basis data
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      break; // No action
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
      data->B.out[i] = basis_data->d_interp1d;
      code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
      data->B.out[i] = basis_data->d_interp1d;
      code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      if (useCollograd) {
        data->G.out[i] = basis_data->d_collograd1d;
        code << " __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
        code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
      } else {
        data->G.out[i] = basis_data->d_grad1d;
        code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
        code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
      return CeedError(ceed, 1,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    // LCOV_EXCL_STOP
    }
  }
  code << "\n // -- Element loop --\n";
  // Grid-stride-style element loop over blockDim.z elements per block.
  code << " __syncthreads();\n";
  code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
  // Input basis apply if needed
  // Generate the correct eval mode code for each input
  code << " // -- Input field restrictions and basis actions --\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChk(ierr);

    // Restriction
    // Skipped for weights and for NONE inputs in the collocated path (those
    // are read slice-by-slice inside the quadrature loop instead).
    if (emode != CEED_EVAL_WEIGHT &&
        !((emode == CEED_EVAL_NONE) && useCollograd)) {
      code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";

      bool isStrided;
      ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
      if (!isStrided) {
        // Indexed (offset) restriction: gather through the index array.
        ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
        CeedChk(ierr);
        code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
        CeedInt compstride;
        ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
        code << " // CompStride: "<<compstride<<"\n";
        ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
        data->indices.in[i] = restr_data->d_ind;
        code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
      } else {
        // Strided restriction: default strides unless the user supplied them.
        bool backendstrides;
        ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
        CeedChk(ierr);
        CeedInt nelem;
        ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
        CeedChk(ierr);
        CeedInt strides[3] = {1, elemsize*nelem, elemsize};
        if (!backendstrides) {
          ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
          CeedChk(ierr);
        }
        code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
        code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
      }
    }

    // Basis action
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      if (!useCollograd) {
        // No basis action: alias the restricted values directly.
        code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
      }
      break;
    case CEED_EVAL_INTERP:
      code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
      code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      if (useCollograd) {
        // Collocated: interpolate now, differentiate per-slice later.
        code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
        code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
      } else {
        code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
        code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      code << " CeedScalar r_t"<<i<<"[Q1d];\n";
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
      ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
      data->W = basis_data->d_qweight1d;
      code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  // Q function
  code << "\n // -- Output field setup --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "\n // ---- Output field "<<i<<" ----\n";
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChk(ierr);
    if (emode==CEED_EVAL_GRAD)
    {
      if (useCollograd) {
        //Accumulator for gradient slices
        // gradColloTranspose3d only accumulates, so zero the buffer here.
        code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
        code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
        code << " for (CeedInt j = 0; j < Q1d; ++j) {\n";
        code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
        code << " }\n";
        code << " }\n";
      } else {
        code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
      }
    }
    if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
    {
      code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
    }
  }
  // We treat quadrature points per slice in 3d to save registers
  if (useCollograd) {
    code << "\n // Note: Collocated Gradient\n";
    code << "#pragma unroll\n";
    code << " for (CeedInt q=0; q<Q1d; q++) {\n";
    code << " // -- Input fields --\n";
    for (CeedInt i = 0; i < numinputfields; i++) {
      code << " // ---- Input field "<<i<<" ----\n";
      // Get elemsize, emode, ncomp
      ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
      CeedChk(ierr);
      // Basis action
      code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
      switch (emode) {
      case CEED_EVAL_NONE:
        // NONE inputs were not restricted up front in this path; read the
        // current quadrature slice directly from the L-vector.
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";

        bool isStrided;
        ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr);
        ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr);
        ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
        if (!isStrided) {
          ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
          CeedChk(ierr);
          code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
          CeedInt compstride;
          ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
          code << " // CompStride: "<<compstride<<"\n";
          ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
          data->indices.in[i] = restr_data->d_ind;
          code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
        } else {
          bool backendstrides;
          ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
          CeedChk(ierr);
          CeedInt nelem;
          ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
          CeedChk(ierr);
          CeedInt strides[3] = {1, elemsize*nelem, elemsize};
          if (!backendstrides) {
            ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
            CeedChk(ierr);
          }
          code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
          // NOTE(review): `"Q1d"","` is two adjacent string literals that
          // concatenate to "Q1d," — intentional but easy to misread.
          code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
        }
        break;
      case CEED_EVAL_INTERP:
        // Copy slice q of the interpolated values into the per-point buffer.
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
        code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
        code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
        code << " }\n";
        break;
      case CEED_EVAL_GRAD:
        code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
        code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        code << " CeedScalar r_q"<<i<<"[1];\n";
        code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
        break; // No action
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    code << "\n // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChk(ierr);
      // Basis action
      // Per-quadrature-point output buffers for the QFunction call.
      switch (emode) {
      case CEED_EVAL_NONE:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
        break;
      case CEED_EVAL_GRAD:
        code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
  } else {
    // Non-collocated path: the QFunction consumes whole fields, so the
    // quadrature buffers simply alias the basis-action results.
    code << "\n // Note: No Collocated Gradient\n";
    code << " // -- Input fields --\n";
    for (CeedInt i = 0; i < numinputfields; i++) {
      code << " // ---- Input field "<<i<<" ----\n";
      code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
    }
    code << " // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
    }
  }
  code << "\n // -- QFunction Inputs and outputs --\n";
  code << " CeedScalar* in["<<numinputfields<<"];\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << " // ---- Input field "<<i<<" ----\n";
    code << " in["<<i<<"] = r_q"<<i<<";\n";
  }
  code << " CeedScalar* out["<<numoutputfields<<"];\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    code << " out["<<i<<"] = r_qq"<<i<<";\n";
  }
  code << "\n // -- Apply QFunction --\n";
  // Q argument matches the CEED_Q_VLA choice made above.
  code << " "<<qFunctionName<<"(ctx, ";
  if (dim != 3 || useCollograd) {
    code << "1";
  } else {
    code << "Q1d";
  }
  code << ", in, out);\n";
  if (useCollograd) {
    // Scatter the per-point QFunction results back into the slice
    // accumulators before closing the quadrature loop.
    code << "\n // Note: Collocated Gradient\n";
    code << " // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << " // ---- Output field "<<i<<" ----\n";
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChk(ierr);
      // Basis action
      code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
      switch (emode) {
      case CEED_EVAL_NONE:
        code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << " }\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << " }\n";
        break;
      case CEED_EVAL_GRAD:
        code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    code << " }\n";
  }

  // Output basis apply if needed
  // Generate the correct eval mode code for each output
  code << "\n // -- Output field basis action and restrictions --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << " // ---- Output field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChk(ierr);
    // Basis action
    code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
      break; // No action
    case CEED_EVAL_INTERP:
      code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      if (useCollograd) {
        // Derivatives were already folded in per-slice; only the transpose
        // interpolation back to nodes remains.
        code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      } else {
        code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
      return CeedError(ceed, 1,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    // LCOV_EXCL_STOP
    }
    // Restriction
    bool isStrided;
    ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
    if (!isStrided) {
      ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
      CeedChk(ierr);
      code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
      CeedInt compstride;
      ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
      code << " // CompStride: "<<compstride<<"\n";
      ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
      data->indices.out[i] = restr_data->d_ind;
      code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
    } else {
      bool backendstrides;
      ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
      CeedChk(ierr);
      CeedInt nelem;
      ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
      CeedChk(ierr);
      CeedInt strides[3] = {1, elemsize*nelem, elemsize};
      if (!backendstrides) {
        ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
        CeedChk(ierr);
      }
      code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
      code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
    }
  }

  // Close the element loop and the kernel body.
  code << " }\n";
  code << "}\n";
  code << "// -----------------------------------------------------------------------------\n\n";

  // View kernel for debugging
  CeedDebug(code.str().c_str());

  // JIT-compile the assembled source; T1d (thread-tile size) is the larger of
  // the quadrature and node counts, and the resulting kernel handle is cached
  // on the operator data.
  ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 1,
                        "T1d", CeedIntMax(Q1d, data->maxP1d));
  CeedChk(ierr);
  ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
  CeedChk(ierr);

  ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
  return 0;
}
//------------------------------------------------------------------------------