1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 

#include "ceed-cuda-gen.h"
#include <iostream>
#include <sstream>
#include "../cuda-reg/ceed-cuda-reg.h"
#include "../cuda-shared/ceed-cuda-shared.h"

// Source for a device-side atomicAdd on double, kept as a string so it can be
// appended to runtime-compiled kernel source. Native double-precision
// atomicAdd requires SM60+; this CAS loop is the standard fallback for older
// architectures -- presumably only included when building for pre-SM60
// devices (TODO confirm at the site that assembles the kernel source).
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
//------------------------------------------------------------------------------
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);

// Device helper functions shared by all generated operator kernels, stored as
// a string for runtime compilation together with the user's QFunction source.
static const char *deviceFunctions = QUOTE(

//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
// Bundles of field pointers passed to the generated kernel
// (up to 16 input and 16 output fields each).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread context: thread indices within the block, the flattened thread
// id, and this thread's view of the scratch slice used by the tensor
// contractions below.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperatively copy a P x Q basis matrix from global memory d_B into B
// (presumably shared memory at the call site -- confirm); all threads of the
// block participate, striding by the total block size.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one element's dofs into registers through the restriction's index
// array. One thread per node (tidx); r_u receives one value per component.
// nnodes is unused in the body; the parameter keeps the signature uniform
// with the other read/write helpers.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather one element's dofs into registers using fixed (node, comp, elem)
// strides instead of an index array.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add element values back into the L-vector. atomicAdd is used
// because the index array may map several element nodes to the same
// L-vector entry (shared nodes between elements).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Scatter-add with fixed strides; plain += (no atomics) -- presumably strided
// destinations are never shared between threads/elements; confirm against
// the restriction semantics at the call site.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// V[tidx] = sum_i B[i + tidx*P1d] * U[i]. Each thread stages its value in
// shared scratch (data.slice) so the whole line of P1d inputs is visible to
// every thread. The first barrier orders the writes before the reads; the
// second protects data.slice before it is reused by the next contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
// Transpose of ContractX1d: V[tidx] = sum_i B[tidx + i*P1d] * U[i],
// reading Q1d staged values from shared scratch.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < Q1d; ++i)
    *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D interpolate
// (to quadrature points)
//------------------------------------------------------------------------------
// Interpolate NCOMP components from nodes to quadrature points, one
// contraction with the interpolation matrix c_B per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: quadrature points back to nodes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// In 1D the gradient is a single contraction with the derivative matrix c_G;
// c_B is unused here and kept only for signature parity with 2D/3D.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient; c_B is unused here (see grad1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one element's dofs into registers through the index array; one
// thread per (tidx, tidy) node of the P1d x P1d element.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather with fixed (node, comp, elem) strides instead of an index array.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add back to the L-vector; atomicAdd because the index array may
// map nodes of different elements to the same entry.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Scatter-add with fixed strides; plain += (no atomics), as in the 1D case.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Contract along x: V(tidx,tidy) = sum_i B[i + tidx*P1d] * U(i,tidy).
// The 2D slab is staged in shared scratch with row stride Q1d; barriers
// order the stage/read phases and protect scratch reuse.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Contract along y: V(tidx,tidy) = sum_i B[i + tidy*P1d] * U(tidx,i).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contraction along y; only threads with tidy < P1d produce
// output, but all threads stage and hit both barriers (no divergence
// around __syncthreads).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose contraction along x; output guarded to tidx < P1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractXTranspose2d but accumulates into V instead of
// overwriting it (no "*V = 0.0").
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate each component: contract x then y with c_B, via a one-value
// register temporary.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract y then x with c_B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Gradient: d/dx = (G x B) U stored at r_V[comp + 0*NCOMP],
//           d/dy = (B x G) U stored at r_V[comp + 1*NCOMP].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: apply the transposes in reverse order; the second
// derivative component is accumulated into r_V via the Add variant.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one element's dofs into registers through the index array; each
// (tidx, tidy) thread holds a full z-line per component: r_u[z + comp*P1d].
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather one element's dofs into registers with fixed (node, comp, elem)
// strides; each (tidx, tidy) thread holds a z-line per component:
// r_u[z + comp*P1d].
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Load the quadrature values of z-slice q of an element (one value per
// component) through the index array. nquads is unused in the body; kept
// for a uniform signature. (Fixed a stray double semicolon here.)
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    r_u[comp] = d_u[ind + COMPSTRIDE * comp];
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Load the quadrature values of z-slice q with fixed strides.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add element values back to the L-vector; atomicAdd because the
// index array may map nodes of different elements to the same entry.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Scatter-add with fixed strides; plain += (no atomics), as in 1D/2D.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
    }
}

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Contract along x for a z-line of values: V[k] = sum_i B[i + tidx*P1d] *
// U_slab(i, tidy) for each z-level k. The thread's B row is cached in
// registers; each z-level is staged through shared scratch with barriers
// ordering the stage/read phases.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// Contract along y for a z-line of values, analogous to ContractX3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// Contract along z: purely register-based (the z-line is thread-local),
// so no shared memory or barriers are needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Transpose contraction along z; also register-only. All Q1d entries of V
// are zeroed even though only the first P1d are computed, so downstream
// contractions that scan Q1d entries see well-defined values.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contraction along y; the thread's B column is cached in
// registers; output guarded to tidy < P1d.
// NOTE(review): r_B is preloaded from B[data.tidy + i*P1d] by ALL threads,
// including tidy >= P1d; when Q1d > P1d this indexes past a P1d*Q1d matrix.
// The values are never used (guarded below), but the load itself looks out
// of bounds -- confirm the allocation size of B at the call site.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3d but accumulates into V (no zeroing).
// NOTE(review): same unguarded r_B preload as ContractTransposeY3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose contraction along x; output guarded to tidx < P1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX3d but accumulates into V (no zeroing).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate each component: contract x, y, then z with c_B, using two
// z-line register temporaries.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract z, y, then x with the transposes of c_B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Gradient: d/dx = (G x B x B) U into r_V[.. + 0*NCOMP*Q1d],
//           d/dy = (B x G x B) U into r_V[.. + 1*NCOMP*Q1d],
//           d/dz = (B x B x G) U into r_V[.. + 2*NCOMP*Q1d].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: apply the transpose contractions in reverse order for
// each of the three derivative components; the second and third components
// are accumulated into r_V via the Add variant of the final x-contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Derivatives at z-slice q using a collocated derivative matrix c_G
// (Q1d x Q1d). The x/y derivatives read the staged slice from shared
// scratch; the z derivative reads the thread-local z-line registers
// directly, so it needs no extra staging. The trailing barrier protects
// data.slice before the next component iteration restages it.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads();
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    // Z derivative
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose of gradCollo3d: accumulate the three derivative components of
// slice q back into r_V. x and y contributions go to r_V[q + comp*Q1d];
// the z contribution is spread across the whole z-line (partial z
// contraction completed over successive q calls).
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    __syncthreads();
    // Y derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    __syncthreads();
    // Z derivative
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
// One quadrature weight per thread (tidx).
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w =
qweight1d[data.tidx]; 706 } 707 708 //------------------------------------------------------------------------------ 709 // 2D quadrature weights 710 //------------------------------------------------------------------------------ 711 template <int Q1d> 712 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 713 *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 714 } 715 716 //------------------------------------------------------------------------------ 717 // 3D quadrature weights 718 //------------------------------------------------------------------------------ 719 template <int Q1d> 720 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 721 const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 722 for (CeedInt z = 0; z < Q1d; ++z) 723 w[z] = pw*qweight1d[z]; 724 } 725 726 ); 727 //------------------------------------------------------------------------------ 728 // Build singe operator kernel 729 //------------------------------------------------------------------------------ 730 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 731 732 using std::ostringstream; 733 using std::string; 734 int ierr; 735 bool setupdone; 736 ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr); 737 if (setupdone) return 0; 738 Ceed ceed; 739 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 740 CeedOperator_Cuda_gen *data; 741 ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); 742 CeedQFunction qf; 743 CeedQFunction_Cuda_gen *qf_data; 744 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 745 ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 746 CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 747 numoutputfields, ncomp, dim = 0, lsize; 748 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 749 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 750 ierr = 
CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 751 CeedChk(ierr); 752 CeedOperatorField *opinputfields, *opoutputfields; 753 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 754 CeedChk(ierr); 755 CeedQFunctionField *qfinputfields, *qfoutputfields; 756 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 757 CeedChk(ierr); 758 CeedEvalMode emode; 759 CeedBasis basis; 760 CeedBasis_Cuda_shared *basis_data; 761 CeedElemRestriction Erestrict; 762 CeedElemRestriction_Cuda_reg *restr_data; 763 764 ostringstream code; 765 string devFunctions(deviceFunctions); 766 767 // Add atomicAdd function for old NVidia architectures 768 struct cudaDeviceProp prop; 769 Ceed_Cuda *ceed_data; 770 ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr); 771 ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 772 if (prop.major<6){ 773 code << atomicAdd; 774 } 775 776 code << devFunctions; 777 778 string qFunction(qf_data->qFunctionSource); 779 780 code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 781 code << "\n#define CeedPragmaSIMD\n"; 782 783 // Find dim and Q1d 784 bool collograd = false; 785 for (CeedInt i = 0; i < numinputfields; i++) { 786 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 787 if (basis != CEED_BASIS_COLLOCATED) { 788 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 789 790 // Check for collocated gradient 791 if (basis_data->d_collograd1d) 792 collograd = true; 793 794 // Collect dim and Q1d 795 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 796 bool isTensor; 797 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 798 if (isTensor) { 799 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 800 } else { 801 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 802 } 803 } 804 } 805 // Check output bases for Q1d, dim as well 806 // The only imput basis might be 
CEED_BASIS_COLLOCATED 807 for (CeedInt i = 0; i < numoutputfields; i++) { 808 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 809 if (basis != CEED_BASIS_COLLOCATED) { 810 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 811 // Collect dim and Q1d 812 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 813 bool isTensor; 814 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 815 if (isTensor) { 816 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 817 } else { 818 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 819 } 820 } 821 } 822 data->dim = dim; 823 data->Q1d = Q1d; 824 825 // Define CEED_Q_VLA 826 if (dim != 3 || collograd) { 827 code << "\n#define CEED_Q_VLA 1\n\n"; 828 } else { 829 code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 830 } 831 832 code << qFunction; 833 834 // Setup 835 code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 836 // Input Evecs and Restriction 837 for (CeedInt i = 0; i < numinputfields; i++) { 838 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 839 CeedChk(ierr); 840 if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 841 code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 842 } 843 } 844 845 for (CeedInt i = 0; i < numoutputfields; i++) { 846 code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 847 } 848 849 code << "const CeedInt Dim = "<<dim<<";\n"; 850 code << "const CeedInt Q1d = "<<Q1d<<";\n"; 851 852 code << "extern __shared__ CeedScalar slice[];\n"; 853 code << "BackendData data;\n"; 854 code << "data.tidx = threadIdx.x;\n"; 855 code << "data.tidy = threadIdx.y;\n"; 856 code << "data.tidz = threadIdx.z;\n"; 857 code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 858 code << "data.slice = 
slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 859 860 code << "\n// Input field constants and basis data\n"; 861 //Initialize constants, and matrices B and G 862 for (CeedInt i = 0; i < numinputfields; i++) { 863 code << "// -- Input field "<<i<<" --\n"; 864 // Get elemsize, emode, ncomp 865 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 866 CeedChk(ierr); 867 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 868 CeedChk(ierr); 869 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 870 CeedChk(ierr); 871 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 872 CeedChk(ierr); 873 874 // Set field constants 875 if (emode != CEED_EVAL_WEIGHT) { 876 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 877 if (basis != CEED_BASIS_COLLOCATED) { 878 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 879 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 880 } else { 881 code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 882 } 883 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 884 } 885 886 // Load basis data 887 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 888 switch (emode) { 889 case CEED_EVAL_NONE: 890 break; 891 case CEED_EVAL_INTERP: 892 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 893 data->B.in[i] = basis_data->d_interp1d; 894 code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 895 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 896 break; 897 case CEED_EVAL_GRAD: 898 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 899 data->B.in[i] = basis_data->d_interp1d; 900 code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 901 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 902 if (basis_data->d_collograd1d) { 903 data->G.in[i] = basis_data->d_collograd1d; 904 code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 905 
code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 906 } else { 907 data->G.in[i] = basis_data->d_grad1d; 908 code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 909 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 910 } 911 break; 912 case CEED_EVAL_WEIGHT: 913 break; // No action 914 case CEED_EVAL_DIV: 915 break; // TODO: Not implemented 916 case CEED_EVAL_CURL: 917 break; // TODO: Not implemented 918 } 919 } 920 921 code << "\n// Output field constants and basis data\n"; 922 for (CeedInt i = 0; i < numoutputfields; i++) { 923 code << "// -- Output field "<<i<<" --\n"; 924 // Get elemsize, emode, ncomp 925 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 926 CeedChk(ierr); 927 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 928 CeedChk(ierr); 929 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 930 CeedChk(ierr); 931 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 932 CeedChk(ierr); 933 934 // Set field constants 935 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 936 if (basis != CEED_BASIS_COLLOCATED) { 937 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 938 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 939 } else { 940 code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 941 } 942 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 943 944 // Load basis data 945 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 946 switch (emode) { 947 case CEED_EVAL_NONE: 948 break; // No action 949 case CEED_EVAL_INTERP: 950 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 951 data->B.out[i] = basis_data->d_interp1d; 952 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 953 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 954 break; 955 case CEED_EVAL_GRAD: 956 ierr = CeedBasisGetData(basis, (void **)&basis_data); 
CeedChk(ierr); 957 data->B.out[i] = basis_data->d_interp1d; 958 code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 959 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 960 if (basis_data->d_collograd1d) { 961 data->G.out[i] = basis_data->d_collograd1d; 962 code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 963 code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 964 } else { 965 data->G.out[i] = basis_data->d_grad1d; 966 code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 967 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 968 } 969 break; 970 case CEED_EVAL_WEIGHT: { 971 Ceed ceed; 972 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 973 return CeedError(ceed, 1, 974 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 975 break; // Should not occur 976 } 977 case CEED_EVAL_DIV: 978 break; // TODO: Not implemented 979 case CEED_EVAL_CURL: 980 break; // TODO: Not implemented 981 } 982 } 983 code << "\n"; 984 code << "__syncthreads();\n"; 985 code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 986 // Input basis apply if needed 987 // Generate the correct eval mode code for each input 988 code << "\n// Input field restrictions and basis actions\n"; 989 for (CeedInt i = 0; i < numinputfields; i++) { 990 code << " // -- Input field "<<i<<" --\n"; 991 // Get elemsize, emode, ncomp 992 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 993 CeedChk(ierr); 994 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 995 CeedChk(ierr); 996 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 997 CeedChk(ierr); 998 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 999 CeedChk(ierr); 1000 1001 // Restriction 1002 if (emode != CEED_EVAL_WEIGHT && 1003 !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) { 1004 code << " 
CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1005 1006 bool isStrided; 1007 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1008 if (!isStrided) { 1009 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1010 CeedChk(ierr); 1011 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1012 CeedInt compstride; 1013 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1014 code << " // CompStride: "<<compstride<<"\n"; 1015 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1016 data->indices.in[i] = restr_data->d_ind; 1017 code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1018 } else { 1019 bool backendstrides; 1020 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1021 CeedChk(ierr); 1022 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1023 if (!backendstrides) { 1024 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1025 CeedChk(ierr); 1026 } 1027 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1028 code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1029 } 1030 } 1031 1032 // Basis action 1033 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1034 switch (emode) { 1035 case CEED_EVAL_NONE: 1036 if (!basis_data->d_collograd1d) { 1037 code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1038 } 1039 break; 1040 case CEED_EVAL_INTERP: 1041 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1042 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1043 break; 1044 case CEED_EVAL_GRAD: 1045 if (basis_data->d_collograd1d) { 1046 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1047 code << " 
interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1048 } else { 1049 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1050 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1051 } 1052 break; 1053 case CEED_EVAL_WEIGHT: 1054 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1055 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1056 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1057 data->W = basis_data->d_qweight1d; 1058 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1059 break; // No action 1060 case CEED_EVAL_DIV: 1061 break; // TODO: Not implemented 1062 case CEED_EVAL_CURL: 1063 break; // TODO: Not implemented 1064 } 1065 } 1066 1067 // Q function 1068 code << "\n// Output field setup\n"; 1069 for (CeedInt i = 0; i < numoutputfields; i++) { 1070 code << " // -- Output field "<<i<<" --\n"; 1071 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1072 CeedChk(ierr); 1073 if (emode==CEED_EVAL_GRAD) 1074 { 1075 if (basis_data->d_collograd1d) { 1076 //Accumulator for gradient slices 1077 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1078 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1079 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1080 code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1081 code << " }\n"; 1082 code << " }\n"; 1083 } else { 1084 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1085 } 1086 } 1087 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1088 { 1089 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1090 } 1091 } 1092 // We treat quadrature points per slice in 3d to save registers 1093 if (basis_data->d_collograd1d) { 1094 code << "\n // Note: Collocated Gradient\n"; 1095 code << "#pragma unroll\n"; 1096 code << "for (CeedInt q=0; q<Q1d; q++) {\n"; 1097 for (CeedInt i = 0; i < 
numinputfields; i++) { 1098 code << " // -- Input field "<<i<<" --\n"; 1099 // Get elemsize, emode, ncomp 1100 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1101 CeedChk(ierr); 1102 // Basis action 1103 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1104 switch (emode) { 1105 case CEED_EVAL_NONE: 1106 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1107 1108 bool isStrided; 1109 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1110 if (!isStrided) { 1111 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1112 CeedChk(ierr); 1113 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1114 CeedInt compstride; 1115 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1116 code << " // CompStride: "<<compstride<<"\n"; 1117 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1118 data->indices.in[i] = restr_data->d_ind; 1119 code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1120 } else { 1121 bool backendstrides; 1122 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1123 CeedChk(ierr); 1124 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1125 if (!backendstrides) { 1126 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1127 CeedChk(ierr); 1128 } 1129 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1130 code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1131 } 1132 break; 1133 case CEED_EVAL_INTERP: 1134 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1135 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1136 code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1137 code << " }\n"; 1138 break; 1139 case CEED_EVAL_GRAD: 1140 code << " 
CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1141 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1142 break; 1143 case CEED_EVAL_WEIGHT: 1144 code << " CeedScalar r_q"<<i<<"[1];\n"; 1145 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1146 break; // No action 1147 case CEED_EVAL_DIV: 1148 break; // TODO: Not implemented 1149 case CEED_EVAL_CURL: 1150 break; // TODO: Not implemented 1151 } 1152 } 1153 for (CeedInt i = 0; i < numoutputfields; i++) { 1154 code << " // -- Output field "<<i<<" --\n"; 1155 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1156 CeedChk(ierr); 1157 // Basis action 1158 switch (emode) { 1159 case CEED_EVAL_NONE: 1160 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1161 break; // No action 1162 case CEED_EVAL_INTERP: 1163 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1164 break; 1165 case CEED_EVAL_GRAD: 1166 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1167 break; 1168 case CEED_EVAL_WEIGHT: 1169 break; // Should not occur 1170 case CEED_EVAL_DIV: 1171 break; // TODO: Not implemented 1172 case CEED_EVAL_CURL: 1173 break; // TODO: Not implemented 1174 } 1175 } 1176 } else { 1177 code << "\n // Note: No Collocated Gradient\n"; 1178 for (CeedInt i = 0; i < numinputfields; i++) { 1179 code << " // -- Input field "<<i<<" --\n"; 1180 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1181 } 1182 for (CeedInt i = 0; i < numoutputfields; i++) { 1183 code << " // -- Output field "<<i<<" --\n"; 1184 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1185 } 1186 } 1187 code << " // QFunction Inputs and outputs\n"; 1188 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1189 for (CeedInt i = 0; i < numinputfields; i++) { 1190 code << " // -- Input field "<<i<<" --\n"; 1191 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1192 } 1193 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1194 for (CeedInt i = 0; i < numoutputfields; i++) { 1195 code << " // 
-- Output field "<<i<<" --\n"; 1196 code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1197 } 1198 code << "\n // Apply QFunction\n"; 1199 string qFunctionName(qf_data->qFunctionName); 1200 code << " "<<qFunctionName<<"(ctx, "; 1201 if (dim != 3 || basis_data->d_collograd1d) { 1202 code << "1 "; 1203 } else { 1204 code << "Q1d "; 1205 } 1206 code << ", in, out);\n"; 1207 if (basis_data->d_collograd1d) { 1208 code << "\n // Note: Collocated Gradient\n"; 1209 for (CeedInt i = 0; i < numoutputfields; i++) { 1210 code << " // -- Output field "<<i<<" --\n"; 1211 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1212 CeedChk(ierr); 1213 // Basis action 1214 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1215 switch (emode) { 1216 case CEED_EVAL_NONE: 1217 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1218 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1219 code << " }\n"; 1220 break; // No action 1221 case CEED_EVAL_INTERP: 1222 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1223 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1224 code << " }\n"; 1225 break; 1226 case CEED_EVAL_GRAD: 1227 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1228 break; 1229 case CEED_EVAL_WEIGHT: 1230 break; // Should not occur 1231 case CEED_EVAL_DIV: 1232 break; // TODO: Not implemented 1233 case CEED_EVAL_CURL: 1234 break; // TODO: Not implemented 1235 } 1236 } 1237 code << "}\n"; 1238 } 1239 1240 // Output basis apply if needed 1241 // Generate the correct eval mode code for each output 1242 code << "\n// Output field basis action and restrictions\n"; 1243 for (CeedInt i = 0; i < numoutputfields; i++) { 1244 code << " // -- Output field "<<i<<" --\n"; 1245 // Get elemsize, emode, ncomp 1246 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1247 CeedChk(ierr); 1248 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1249 
CeedChk(ierr); 1250 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1251 CeedChk(ierr); 1252 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1253 CeedChk(ierr); 1254 // Basis action 1255 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1256 switch (emode) { 1257 case CEED_EVAL_NONE: 1258 code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1259 break; // No action 1260 case CEED_EVAL_INTERP: 1261 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1262 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1263 break; 1264 case CEED_EVAL_GRAD: 1265 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1266 if (basis_data->d_collograd1d) { 1267 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1268 } else { 1269 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1270 } 1271 break; 1272 case CEED_EVAL_WEIGHT: { 1273 Ceed ceed; 1274 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1275 return CeedError(ceed, 1, 1276 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1277 break; // Should not occur 1278 } 1279 case CEED_EVAL_DIV: 1280 break; // TODO: Not implemented 1281 case CEED_EVAL_CURL: 1282 break; // TODO: Not implemented 1283 } 1284 // Restriction 1285 bool isStrided; 1286 ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1287 if (!isStrided) { 1288 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1289 CeedChk(ierr); 1290 code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 1291 CeedInt compstride; 1292 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1293 code << " // CompStride: "<<compstride<<"\n"; 1294 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); 
CeedChk(ierr); 1295 data->indices.out[i] = restr_data->d_ind; 1296 code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1297 } else { 1298 bool backendstrides; 1299 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1300 CeedChk(ierr); 1301 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1302 if (!backendstrides) { 1303 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1304 CeedChk(ierr); 1305 } 1306 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1307 code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1308 } 1309 } 1310 1311 code << " }\n"; 1312 code << "}\n\n"; 1313 1314 // View kernel for debugging 1315 // std::cout << code.str(); 1316 1317 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); 1318 CeedChk(ierr); 1319 ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1320 CeedChk(ierr); 1321 1322 ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1323 return 0; 1324 } 1325 //------------------------------------------------------------------------------ 1326