1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#define CEED_DEBUG_COLOR 12

#include "ceed-cuda-gen.h"
#include <iostream>
#include <sstream>
#include "../cuda-reg/ceed-cuda-reg.h"
#include "../cuda-shared/ceed-cuda-shared.h"

// JIT source string: software atomicAdd(double*, double) fallback.
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
// Emulates the double-precision atomicAdd with an atomicCAS loop (hardware
// support requires SM60+, so this string is only needed on older arches).
//------------------------------------------------------------------------------
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);

// JIT source string: shared device helpers (restriction gather/scatter,
// tensor contractions, interp/grad/weight) used by the generated operator
// kernels built in CeedCudaGenOperatorBuild below.
static const char *deviceFunctions = QUOTE(

//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
// Bundles of input/output field pointers (up to 16 fields each).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread context: thread indices, flat thread id, and a pointer to the
// block's shared-memory scratch slice used by the contraction helpers.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

//------------------------------------------------------------------------------
// Load matrices for basis actions
// Cooperative block-strided copy of the P x Q basis matrix d_B into B,
// valid for any block shape (stride is the full block size).
//------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
// One thread per node (tidx); gathers all NCOMP components through the
// element index list. nnodes is unused here -- presumably kept for a uniform
// signature across restriction variants; TODO confirm.
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
// Same gather, but the L-vector location is computed from compile-time
// node/component/element strides instead of an index list.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
// Scatter-accumulate; atomicAdd because several elements may share the same
// L-vector node through the index list.
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
// Plain += (no atomics) -- presumably the strided layout gives every
// (node, elem) pair a unique destination; TODO confirm with restriction setup.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 1D tensor contraction x
// V[tidx] = sum_i B[tidx, i] * U[i]; U is staged through shared memory
// (data.slice). Both barriers are unconditional, so every thread in the
// block reaches them.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
// Same staging as ContractX1d but applies B transposed (index tidx + i*P1d).
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < Q1d; ++i)
    *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
// In 1D the gradient is a single contraction with c_G; c_B is unused but kept
// so grad has the same signature in every dimension.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
// One thread per (tidx, tidy) node of the P1d x P1d element.
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
// atomicAdd: nodes shared between elements accumulate concurrently.
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 2D tensor contraction x
// Shared slice is a Q1d x Q1d tile indexed (tidx, tidy); contracts B along
// the x (tidx) direction.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
// Only threads with tidy < P1d produce output; the barriers stay outside the
// guard so the whole block reaches them.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
// "Add" variant: accumulates into V (no zeroing) -- caller must have
// initialized V.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
// Output layout: r_V[comp + d*NCOMP] holds the d-th partial derivative.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
// Second derivative's x-contraction uses the Add variant so both directional
// contributions accumulate into r_V.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
// 2D thread tile; each thread walks the z-line and stores component comp at
// r_u[z + comp*P1d].
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
// 2D thread tile; each thread walks its z-line, storing component comp at
// r_u[z + comp*P1d].
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
// Reads one z = q slice of quadrature data (NCOMP values per thread).
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    r_u[comp] = d_u[ind + COMPSTRIDE * comp];
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
// atomicAdd: nodes shared between elements accumulate concurrently.
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
// Plain += -- the strided layout gives each (node, elem) a unique location.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
    }
}

//------------------------------------------------------------------------------
// 3D tensor contract x
// Caches this thread's row of B in registers, then contracts each z-slice
// through the shared Q1d x Q1d tile. Barriers are unconditional.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
// Purely register-local (z data is held per-thread), so no shared memory or
// synchronization is needed.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
// Zeroes all Q1d entries of V so later contractions see clean data beyond
// the P1d valid outputs.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
// Fix: the register preload of B's row is guarded by tidy < P1d. B has
// P1d*Q1d entries, so unguarded loads read out of bounds whenever
// tidy >= P1d (possible when Q1d > P1d); those values were never used, so
// guarding preserves results while removing the OOB reads.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  if (data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
// "Add" variant: accumulates into V (no zeroing). Preload guarded as in
// ContractTransposeY3d to avoid OOB reads of B.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  if (data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
// Preload guarded by tidx < P1d for the same OOB reason as the y variants.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
// "Add" variant: accumulates into V (no zeroing). Preload guarded as above.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
// Output layout: r_V[comp*Q1d + d*NCOMP*Q1d] holds the d-th partial
// derivative; each derivative applies c_G along one direction and c_B along
// the other two.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
// Accumulates all three directional contributions into r_V[comp*P1d] via the
// Add variants of the x contraction.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
// Basis is collocated at quadrature points, so derivatives are single
// contractions with c_G; the z contraction is register-local since each
// thread holds its full z-line of r_U.
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads();
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    // Z derivative
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
// NOTE(review): accumulates into r_V with += -- caller is expected to have
// initialized r_V; confirm in the generated kernel.
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    __syncthreads();
    // Y derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    __syncthreads();
    // Z derivative
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
// One weight per thread (tidx).
//------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w =
qweight1d[data.tidx]; 707 } 708 709 //------------------------------------------------------------------------------ 710 // 2D quadrature weights 711 //------------------------------------------------------------------------------ 712 template <int Q1d> 713 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 714 *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 715 } 716 717 //------------------------------------------------------------------------------ 718 // 3D quadrature weights 719 //------------------------------------------------------------------------------ 720 template <int Q1d> 721 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 722 const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 723 for (CeedInt z = 0; z < Q1d; ++z) 724 w[z] = pw*qweight1d[z]; 725 } 726 727 ); 728 //------------------------------------------------------------------------------ 729 // Build singe operator kernel 730 //------------------------------------------------------------------------------ 731 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 732 733 using std::ostringstream; 734 using std::string; 735 int ierr; 736 bool setupdone; 737 ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr); 738 if (setupdone) return 0; 739 Ceed ceed; 740 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 741 CeedOperator_Cuda_gen *data; 742 ierr = CeedOperatorGetData(op, &data); CeedChk(ierr); 743 CeedQFunction qf; 744 CeedQFunction_Cuda_gen *qf_data; 745 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 746 ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr); 747 CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 748 numoutputfields, ncomp, dim = 0, lsize; 749 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 750 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 751 ierr = CeedQFunctionGetNumArgs(qf, 
&numinputfields, &numoutputfields); 752 CeedChk(ierr); 753 CeedOperatorField *opinputfields, *opoutputfields; 754 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 755 CeedChk(ierr); 756 CeedQFunctionField *qfinputfields, *qfoutputfields; 757 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 758 CeedChk(ierr); 759 CeedEvalMode emode; 760 CeedBasis basis; 761 CeedBasis_Cuda_shared *basis_data; 762 CeedElemRestriction Erestrict; 763 CeedElemRestriction_Cuda_reg *restr_data; 764 765 ostringstream code; 766 string devFunctions(deviceFunctions); 767 768 // Add atomicAdd function for old NVidia architectures 769 struct cudaDeviceProp prop; 770 Ceed_Cuda *ceed_data; 771 ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr); 772 ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 773 if (prop.major<6){ 774 code << atomicAdd; 775 } 776 777 code << devFunctions; 778 779 string qFunction(qf_data->qFunctionSource); 780 string qFunctionName(qf_data->qFunctionName); 781 string oper; 782 oper = "CeedKernel_Cuda_gen_" + qFunctionName; 783 784 code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 785 code << "\n#define CeedPragmaSIMD\n"; 786 787 // Find dim and Q1d 788 bool collograd = true; 789 for (CeedInt i = 0; i < numinputfields; i++) { 790 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 791 if (basis != CEED_BASIS_COLLOCATED) { 792 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 793 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 794 CeedChk(ierr); 795 796 // Check for collocated gradient 797 if (emode == CEED_EVAL_GRAD) 798 collograd = collograd && !!basis_data->d_collograd1d; 799 800 // Collect dim and Q1d 801 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 802 bool isTensor; 803 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 804 if (isTensor) { 805 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 806 } else { 807 // 
LCOV_EXCL_START 808 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 809 // LCOV_EXCL_STOP 810 } 811 } 812 } 813 // Check output bases for Q1d, dim as well 814 // The only imput basis might be CEED_BASIS_COLLOCATED 815 for (CeedInt i = 0; i < numoutputfields; i++) { 816 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 817 818 if (basis != CEED_BASIS_COLLOCATED) { 819 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 820 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 821 CeedChk(ierr); 822 823 // Collect dim and Q1d 824 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 825 bool isTensor; 826 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 827 if (isTensor) { 828 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 829 } else { 830 // LCOV_EXCL_START 831 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 832 // LCOV_EXCL_STOP 833 } 834 835 // Check for collocated gradient 836 if (emode == CEED_EVAL_GRAD) 837 collograd = collograd && !!basis_data->d_collograd1d; 838 } 839 } 840 data->dim = dim; 841 data->Q1d = Q1d; 842 843 // Define CEED_Q_VLA 844 if (dim != 3 || collograd) { 845 code << "\n#define CEED_Q_VLA 1\n\n"; 846 } else { 847 code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 848 } 849 850 code << qFunction; 851 852 // Setup 853 code << "\n// -----------------------------------------------------------------------------\n"; 854 code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 855 for (CeedInt i = 0; i < numinputfields; i++) { 856 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 857 CeedChk(ierr); 858 if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 859 code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 860 } 861 } 862 863 for (CeedInt i = 0; i < 
numoutputfields; i++) { 864 code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 865 } 866 867 code << " const CeedInt Dim = "<<dim<<";\n"; 868 code << " const CeedInt Q1d = "<<Q1d<<";\n"; 869 870 code << " extern __shared__ CeedScalar slice[];\n"; 871 code << " BackendData data;\n"; 872 code << " data.tidx = threadIdx.x;\n"; 873 code << " data.tidy = threadIdx.y;\n"; 874 code << " data.tidz = threadIdx.z;\n"; 875 code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 876 code << " data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 877 878 code << "\n // -- Input field constants and basis data --\n"; 879 //Initialize constants, and matrices B and G 880 for (CeedInt i = 0; i < numinputfields; i++) { 881 code << " // ---- Input field "<<i<<" ----\n"; 882 // Get elemsize, emode, ncomp 883 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 884 CeedChk(ierr); 885 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 886 CeedChk(ierr); 887 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 888 CeedChk(ierr); 889 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 890 CeedChk(ierr); 891 892 // Set field constants 893 if (emode != CEED_EVAL_WEIGHT) { 894 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 895 if (basis != CEED_BASIS_COLLOCATED) { 896 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 897 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 898 } else { 899 code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 900 } 901 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 902 } 903 904 // Load basis data 905 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 906 switch (emode) { 907 case CEED_EVAL_NONE: 908 break; 909 case CEED_EVAL_INTERP: 910 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 911 data->B.in[i] = basis_data->d_interp1d; 912 code << " __shared__ double 
s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 913 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 914 break; 915 case CEED_EVAL_GRAD: 916 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 917 data->B.in[i] = basis_data->d_interp1d; 918 code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 919 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 920 if (collograd) { 921 data->G.in[i] = basis_data->d_collograd1d; 922 code << " __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 923 code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 924 } else { 925 data->G.in[i] = basis_data->d_grad1d; 926 code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 927 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 928 } 929 break; 930 case CEED_EVAL_WEIGHT: 931 break; // No action 932 case CEED_EVAL_DIV: 933 break; // TODO: Not implemented 934 case CEED_EVAL_CURL: 935 break; // TODO: Not implemented 936 } 937 } 938 939 code << "\n // -- Output field constants and basis data --\n"; 940 for (CeedInt i = 0; i < numoutputfields; i++) { 941 code << " // ---- Output field "<<i<<" ----\n"; 942 // Get elemsize, emode, ncomp 943 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 944 CeedChk(ierr); 945 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 946 CeedChk(ierr); 947 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 948 CeedChk(ierr); 949 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 950 CeedChk(ierr); 951 952 // Set field constants 953 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 954 if (basis != CEED_BASIS_COLLOCATED) { 955 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 956 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 957 } else { 958 code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 959 } 960 code << " const CeedInt 
ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 961 962 // Load basis data 963 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 964 switch (emode) { 965 case CEED_EVAL_NONE: 966 break; // No action 967 case CEED_EVAL_INTERP: 968 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 969 data->B.out[i] = basis_data->d_interp1d; 970 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 971 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 972 break; 973 case CEED_EVAL_GRAD: 974 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 975 data->B.out[i] = basis_data->d_interp1d; 976 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 977 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 978 if (collograd) { 979 data->G.out[i] = basis_data->d_collograd1d; 980 code << " __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 981 code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 982 } else { 983 data->G.out[i] = basis_data->d_grad1d; 984 code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 985 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 986 } 987 break; 988 // LCOV_EXCL_START 989 case CEED_EVAL_WEIGHT: { 990 Ceed ceed; 991 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 992 return CeedError(ceed, 1, 993 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 994 break; // Should not occur 995 } 996 case CEED_EVAL_DIV: 997 break; // TODO: Not implemented 998 case CEED_EVAL_CURL: 999 break; // TODO: Not implemented 1000 // LCOV_EXCL_STOP 1001 } 1002 } 1003 code << "\n // -- Element loop --\n"; 1004 code << " __syncthreads();\n"; 1005 code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1006 // Input basis apply if needed 1007 // Generate the correct eval mode code for each input 1008 code << " // -- Input field restrictions and basis actions 
--\n"; 1009 for (CeedInt i = 0; i < numinputfields; i++) { 1010 code << " // ---- Input field "<<i<<" ----\n"; 1011 // Get elemsize, emode, ncomp 1012 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1013 CeedChk(ierr); 1014 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1015 CeedChk(ierr); 1016 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1017 CeedChk(ierr); 1018 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1019 CeedChk(ierr); 1020 1021 // Restriction 1022 if (emode != CEED_EVAL_WEIGHT && 1023 !((emode == CEED_EVAL_NONE) && collograd)) { 1024 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1025 1026 bool isStrided; 1027 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1028 if (!isStrided) { 1029 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1030 CeedChk(ierr); 1031 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1032 CeedInt compstride; 1033 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1034 code << " // CompStride: "<<compstride<<"\n"; 1035 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1036 data->indices.in[i] = restr_data->d_ind; 1037 code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1038 } else { 1039 bool backendstrides; 1040 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1041 CeedChk(ierr); 1042 CeedInt nelem; 1043 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1044 CeedChk(ierr); 1045 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1046 if (!backendstrides) { 1047 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1048 CeedChk(ierr); 1049 } 1050 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1051 code << " 
readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1052 } 1053 } 1054 1055 // Basis action 1056 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1057 switch (emode) { 1058 case CEED_EVAL_NONE: 1059 if (!collograd) { 1060 code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1061 } 1062 break; 1063 case CEED_EVAL_INTERP: 1064 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1065 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1066 break; 1067 case CEED_EVAL_GRAD: 1068 if (collograd) { 1069 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1070 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1071 } else { 1072 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1073 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1074 } 1075 break; 1076 case CEED_EVAL_WEIGHT: 1077 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1078 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1079 ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 1080 data->W = basis_data->d_qweight1d; 1081 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1082 break; // No action 1083 case CEED_EVAL_DIV: 1084 break; // TODO: Not implemented 1085 case CEED_EVAL_CURL: 1086 break; // TODO: Not implemented 1087 } 1088 } 1089 1090 // Q function 1091 code << "\n // -- Output field setup --\n"; 1092 for (CeedInt i = 0; i < numoutputfields; i++) { 1093 code << "\n // ---- Output field "<<i<<" ----\n"; 1094 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1095 CeedChk(ierr); 1096 if (emode==CEED_EVAL_GRAD) 1097 { 1098 if (collograd) { 1099 //Accumulator for gradient slices 1100 code << " CeedScalar 
r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1101 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1102 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1103 code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1104 code << " }\n"; 1105 code << " }\n"; 1106 } else { 1107 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1108 } 1109 } 1110 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1111 { 1112 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1113 } 1114 } 1115 // We treat quadrature points per slice in 3d to save registers 1116 if (collograd) { 1117 code << "\n // Note: Collocated Gradient\n"; 1118 code << "#pragma unroll\n"; 1119 code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1120 code << " // -- Input fields --\n"; 1121 for (CeedInt i = 0; i < numinputfields; i++) { 1122 code << " // ---- Input field "<<i<<" ----\n"; 1123 // Get elemsize, emode, ncomp 1124 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1125 CeedChk(ierr); 1126 // Basis action 1127 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1128 switch (emode) { 1129 case CEED_EVAL_NONE: 1130 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1131 1132 bool isStrided; 1133 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr); 1134 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr); 1135 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1136 if (!isStrided) { 1137 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1138 CeedChk(ierr); 1139 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1140 CeedInt compstride; 1141 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1142 code << " // CompStride: "<<compstride<<"\n"; 1143 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1144 data->indices.in[i] = restr_data->d_ind; 1145 code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", 
"<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1146 } else { 1147 bool backendstrides; 1148 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1149 CeedChk(ierr); 1150 CeedInt nelem; 1151 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1152 CeedChk(ierr); 1153 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1154 if (!backendstrides) { 1155 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1156 CeedChk(ierr); 1157 } 1158 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1159 code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1160 } 1161 break; 1162 case CEED_EVAL_INTERP: 1163 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1164 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1165 code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1166 code << " }\n"; 1167 break; 1168 case CEED_EVAL_GRAD: 1169 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1170 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1171 break; 1172 case CEED_EVAL_WEIGHT: 1173 code << " CeedScalar r_q"<<i<<"[1];\n"; 1174 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1175 break; // No action 1176 case CEED_EVAL_DIV: 1177 break; // TODO: Not implemented 1178 case CEED_EVAL_CURL: 1179 break; // TODO: Not implemented 1180 } 1181 } 1182 code << "\n // -- Output fields --\n"; 1183 for (CeedInt i = 0; i < numoutputfields; i++) { 1184 code << " // ---- Output field "<<i<<" ----\n"; 1185 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1186 CeedChk(ierr); 1187 // Basis action 1188 switch (emode) { 1189 case CEED_EVAL_NONE: 1190 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1191 break; // No action 1192 case CEED_EVAL_INTERP: 1193 code << " CeedScalar 
r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1194 break; 1195 case CEED_EVAL_GRAD: 1196 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1197 break; 1198 case CEED_EVAL_WEIGHT: 1199 break; // Should not occur 1200 case CEED_EVAL_DIV: 1201 break; // TODO: Not implemented 1202 case CEED_EVAL_CURL: 1203 break; // TODO: Not implemented 1204 } 1205 } 1206 } else { 1207 code << "\n // Note: No Collocated Gradient\n"; 1208 code << " // -- Input fields --\n"; 1209 for (CeedInt i = 0; i < numinputfields; i++) { 1210 code << " // ---- Input field "<<i<<" ----\n"; 1211 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1212 } 1213 code << " // -- Output fields --\n"; 1214 for (CeedInt i = 0; i < numoutputfields; i++) { 1215 code << " // ---- Output field "<<i<<" ----\n"; 1216 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1217 } 1218 } 1219 code << "\n // -- QFunction Inputs and outputs --\n"; 1220 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1221 for (CeedInt i = 0; i < numinputfields; i++) { 1222 code << " // ---- Input field "<<i<<" ----\n"; 1223 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1224 } 1225 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1226 for (CeedInt i = 0; i < numoutputfields; i++) { 1227 code << " // ---- Output field "<<i<<" ----\n"; 1228 code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1229 } 1230 code << "\n // -- Apply QFunction --\n"; 1231 code << " "<<qFunctionName<<"(ctx, "; 1232 if (dim != 3 || collograd) { 1233 code << "1"; 1234 } else { 1235 code << "Q1d"; 1236 } 1237 code << ", in, out);\n"; 1238 if (collograd) { 1239 code << "\n // Note: Collocated Gradient\n"; 1240 code << " // -- Output fields --\n"; 1241 for (CeedInt i = 0; i < numoutputfields; i++) { 1242 code << " // ---- Output field "<<i<<" ----\n"; 1243 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1244 CeedChk(ierr); 1245 // Basis action 1246 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1247 switch (emode) { 1248 case CEED_EVAL_NONE: 1249 code << 
" for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1250 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1251 code << " }\n"; 1252 break; // No action 1253 case CEED_EVAL_INTERP: 1254 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1255 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1256 code << " }\n"; 1257 break; 1258 case CEED_EVAL_GRAD: 1259 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1260 break; 1261 case CEED_EVAL_WEIGHT: 1262 break; // Should not occur 1263 case CEED_EVAL_DIV: 1264 break; // TODO: Not implemented 1265 case CEED_EVAL_CURL: 1266 break; // TODO: Not implemented 1267 } 1268 } 1269 code << " }\n"; 1270 } 1271 1272 // Output basis apply if needed 1273 // Generate the correct eval mode code for each output 1274 code << "\n // -- Output field basis action and restrictions --\n"; 1275 for (CeedInt i = 0; i < numoutputfields; i++) { 1276 code << " // ---- Output field "<<i<<" ----\n"; 1277 // Get elemsize, emode, ncomp 1278 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1279 CeedChk(ierr); 1280 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1281 CeedChk(ierr); 1282 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1283 CeedChk(ierr); 1284 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1285 CeedChk(ierr); 1286 // Basis action 1287 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1288 switch (emode) { 1289 case CEED_EVAL_NONE: 1290 code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1291 break; // No action 1292 case CEED_EVAL_INTERP: 1293 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1294 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1295 break; 1296 case CEED_EVAL_GRAD: 1297 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1298 if (collograd) { 
1299 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1300 } else { 1301 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1302 } 1303 break; 1304 // LCOV_EXCL_START 1305 case CEED_EVAL_WEIGHT: { 1306 Ceed ceed; 1307 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1308 return CeedError(ceed, 1, 1309 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1310 break; // Should not occur 1311 } 1312 case CEED_EVAL_DIV: 1313 break; // TODO: Not implemented 1314 case CEED_EVAL_CURL: 1315 break; // TODO: Not implemented 1316 // LCOV_EXCL_STOP 1317 } 1318 // Restriction 1319 bool isStrided; 1320 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1321 if (!isStrided) { 1322 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1323 CeedChk(ierr); 1324 code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 1325 CeedInt compstride; 1326 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1327 code << " // CompStride: "<<compstride<<"\n"; 1328 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1329 data->indices.out[i] = restr_data->d_ind; 1330 code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1331 } else { 1332 bool backendstrides; 1333 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1334 CeedChk(ierr); 1335 CeedInt nelem; 1336 ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1337 CeedChk(ierr); 1338 CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1339 if (!backendstrides) { 1340 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1341 CeedChk(ierr); 1342 } 1343 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1344 code 
<< " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1345 } 1346 } 1347 1348 code << " }\n"; 1349 code << "}\n"; 1350 code << "// -----------------------------------------------------------------------------\n\n"; 1351 1352 // View kernel for debugging 1353 CeedDebug(code.str().c_str()); 1354 1355 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); 1356 CeedChk(ierr); 1357 ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1358 CeedChk(ierr); 1359 1360 ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1361 return 0; 1362 } 1363 //------------------------------------------------------------------------------ 1364