1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#include <ceed-backend.h>
#include "ceed-cuda-gen.h"
#include <iostream>
#include <sstream>
#include "../cuda/ceed-cuda.h"
#include "../cuda-reg/ceed-cuda-reg.h"
#include "../cuda-shared/ceed-cuda-shared.h"

// Device-side helper templates, captured as a string via QUOTE and compiled at
// operator build time together with the generated kernel. Comments here are
// stripped by the host preprocessor during stringification, so they never
// reach the runtime-compiled source.
static const char *deviceFunctions = QUOTE(

typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread state shared by all helpers: thread coordinates within the block
// and a pointer into the dynamic shared-memory slice owned by this tidz layer.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

#if __CUDA_ARCH__ < 600
// Software double-precision atomicAdd for devices older than SM60 (Pascal),
// which lack the native instruction. Standard CAS loop from the CUDA
// Programming Guide.
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif // __CUDA_ARCH__ < 600

// Cooperatively copy a P*Q basis matrix from global (d_B) to shared (B)
// memory, striding by the full block size. The barrier guarantees the matrix
// is complete before any thread uses it.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (int i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
  // Bug fix: was '__syncthreads;' (no call) — the barrier was never issued,
  // so threads could read B before it was fully loaded.
  __syncthreads();
}

//****
// 1D

// Gather the element's dofs (one per tidx) for all components, L-vector
// ordered by node then component blocks of size ndofs. 'indices' may be NULL
// for the identity restriction.
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt dof = data.tidx;
    const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + ndofs * comp];
    }
  }
}

// Same as readDofs1d but for interleaved component storage
// (components contiguous per node).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt dof = data.tidx;
    const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Load quadrature-point values (no restriction indirection), component-blocked.
template <int NCOMP, int Q1d>
inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt dof = data.tidx;
  const CeedInt ind = dof + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Load quadrature-point values, interleaved component storage.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt dof = data.tidx;
  const CeedInt ind = dof + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add element dofs back to the L-vector; atomicAdd because multiple
// elements may share a dof through the restriction indices.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt dof = data.tidx;
    const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + ndofs * comp], r_v[comp]);
    }
  }
}

// Scatter-add with interleaved component storage.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt dof = data.tidx;
    const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Store quadrature-point values, component-blocked; no aliasing between
// elements, so plain stores suffice.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt dof = data.tidx;
  const CeedInt ind = dof + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Store quadrature-point values, interleaved component storage.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt dof = data.tidx;
  const CeedInt ind = dof + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}

// 1D tensor contraction, dofs -> quads: V[tidx] = sum_i B[i + tidx*P1d]*U[i].
// The slice exchanges per-thread values through shared memory; both barriers
// are unconditional so every thread reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i]; //contract x direction
  }
  __syncthreads();
}

// Transposed 1D contraction, quads -> dofs.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  // Bug fix: guard tidx < P1d as in ContractXTranspose2d. Without it, threads
  // with tidx >= P1d read B[tidx + i*P1d] past the end of the P1d*Q1d matrix
  // (their V result is never consumed — the write helpers are guarded).
  if (data.tidx < P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i]; //contract x direction
    }
  }
  __syncthreads();
}

// Interpolate dofs -> quads for each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// Transpose of interp1d: quads -> dofs.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// Gradient dofs -> quads (1D: single derivative, uses G only; c_B unused but
// kept for a signature parallel to the 2D/3D variants).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

// Transpose of grad1d: quads -> dofs.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

//****
// 2D

// Gather the element's dofs (node = tidx + tidy*P1d), component-blocked.
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + ndofs * comp];
    }
  }
}

// Gather with interleaved component storage.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Load quadrature-point values (quad = tidx + tidy*Q1d), component-blocked.
template <int NCOMP, int Q1d>
inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt dof = data.tidx + data.tidy*Q1d;
  const CeedInt ind = dof + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Load quadrature-point values, interleaved component storage.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt dof = data.tidx + data.tidy*Q1d;
  const CeedInt ind = dof + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add element dofs to the L-vector (atomic: dofs shared by elements).
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + ndofs * comp], r_v[comp]);
    }
  }
}

// Scatter-add with interleaved component storage.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Store quadrature-point values, component-blocked.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt dof = data.tidx + data.tidy*Q1d;
  const CeedInt ind = dof + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Store quadrature-point values, interleaved component storage.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt dof = data.tidx + data.tidy*Q1d;
  const CeedInt ind = dof + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}

// 2D contraction in x, dofs -> quads; the shared slice is Q1d-strided so the
// same layout works for both P1d and Q1d extents.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; //contract x direction
  }
  __syncthreads();
}

// 2D contraction in y, dofs -> quads.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; //contract y direction
  }
  __syncthreads();
}

// Transposed 2D contraction in y, quads -> dofs; only tidy < P1d produces a
// result, but the barriers stay outside the guard so all threads reach them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy < P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; //contract y direction
    }
  }
  __syncthreads();
}

// Transposed 2D contraction in x, quads -> dofs.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; //contract x direction
    }
  }
  __syncthreads();
}

// Like ContractXTranspose2d but accumulates into V instead of overwriting —
// used to sum the two derivative contributions in gradTranspose2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx < P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; //contract x direction
    }
  }
  __syncthreads();
}

// Interpolate dofs -> quads: contract x then y with B for each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// Transpose of interp2d: quads -> dofs (y then x transposed contractions).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// Gradient dofs -> quads: d/dx = (G_x, B_y), d/dy = (B_x, G_y); the two
// derivative slots are NCOMP apart in r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}

// Transpose of grad2d: quads -> dofs; the second derivative's contribution is
// accumulated onto the first via ContractXTransposeAdd2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

//****
// 3D

// Gather the element's dofs: each (tidx,tidy) thread owns a z-column of P1d
// values per component, stored as r_u[z + comp*P1d]. 'indices' may be NULL
// for the identity restriction; dof data is component-blocked (stride ndofs).
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
      for(CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + ndofs * comp];
      }
    }
  }
}

// Gather with interleaved component storage (components contiguous per node).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
      for(CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
      }
    }
  }
}

// Load quadrature-point values: one z-column of Q1d values per component,
// component-blocked global storage.
template <int NCOMP, int Q1d>
inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for(CeedInt z=0; z < Q1d; ++z) {
    const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind + nquads * comp];
    }
  }
}

// Load quadrature-point values, interleaved component storage.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for(CeedInt z=0; z < Q1d; ++z) {
    const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp];
    }
  }
}

// Scatter-add element dofs to the L-vector; atomicAdd because restriction
// indices may map several element nodes to the same dof.
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
      for(CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + ndofs * comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Scatter-add with interleaved component storage.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
      for(CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Store quadrature-point values, component-blocked; no cross-element aliasing,
// plain stores suffice.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for(CeedInt z=0; z < Q1d; ++z) {
    const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + nquads * comp] = r_v[z+comp*Q1d];
    }
  }
}

// Store quadrature-point values, interleaved component storage.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for(CeedInt z=0; z < Q1d; ++z) {
    const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d];
    }
  }
}

// 3D contraction in x applied plane-by-plane over the z index; x/y planes are
// exchanged through the shared slice, z stays in registers. Barriers are
// unconditional so all threads in the block reach them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
    __syncthreads();
  }
}

// 3D contraction in y, plane-by-plane over z.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
    __syncthreads();
  }
}

// 3D contraction in z: purely register-resident (each thread owns the full
// z-column), so no shared memory or synchronization is needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i];//contract z direction
    }
  }
}

// Transposed 3D contraction in z; entries k >= P1d are zeroed so later
// stages can safely iterate over the full Q1d-sized buffer.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i];//contract z direction
      }
    }
  }
}

// Transposed 3D contraction in y; only tidy < P1d computes, but the barriers
// stay outside the guard so every thread reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Transposed 3D contraction in x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// Like ContractTransposeX3d but accumulates into V — used to sum derivative
// contributions in gradTranspose3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// Interpolate dofs -> quads: contract x, y, z with B for each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}

// Transpose of interp3d: quads -> dofs (z, y, x transposed contractions).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// Gradient dofs -> quads: d/dx = (G,B,B), d/dy = (B,G,B), d/dz = (B,B,G);
// the three derivative slots are NCOMP*Q1d apart in r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}

// Transpose of grad3d: quads -> dofs; the second and third derivative
// contributions are accumulated via ContractTransposeAddX3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 1D quadrature weight: one weight per tidx.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx];
}

// 2D quadrature weight: tensor product of the 1D weights at (tidx, tidy).
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w)
{ 669 *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 670 } 671 672 template <int Q1d> 673 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 674 const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 675 for (int z = 0; z < Q1d; ++z) 676 { 677 w[z] = pw*qweight1d[z]; 678 } 679 } 680 681 ); 682 683 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 684 685 using std::ostringstream; 686 using std::string; 687 int ierr; 688 bool setupdone; 689 ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr); 690 if (setupdone) return 0; 691 Ceed ceed; 692 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 693 CeedOperator_Cuda_gen *data; 694 ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); 695 CeedQFunction qf; 696 CeedQFunction_Cuda_gen *qf_data; 697 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 698 ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 699 CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, ndof; 700 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 701 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 702 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 703 CeedChk(ierr); 704 CeedOperatorField *opinputfields, *opoutputfields; 705 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 706 CeedChk(ierr); 707 CeedQFunctionField *qfinputfields, *qfoutputfields; 708 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 709 CeedChk(ierr); 710 CeedEvalMode emode; 711 CeedTransposeMode lmode; 712 CeedBasis basis; 713 CeedBasis_Cuda_shared *basis_data; 714 CeedElemRestriction Erestrict; 715 CeedElemRestriction_Cuda_reg *restr_data; 716 717 ostringstream code; 718 string devFunctions(deviceFunctions); 719 720 code << devFunctions; 721 722 string qFunction(qf_data->qFunctionSource); 723 code << qFunction; 724 725 // Setup 726 code << 
"\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 727 // Input Evecs and Restriction 728 for (CeedInt i = 0; i < numinputfields; i++) { 729 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 730 CeedChk(ierr); 731 if (emode == CEED_EVAL_WEIGHT) { // Skip 732 } else { 733 code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 734 if (emode != CEED_EVAL_NONE) 735 { 736 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 737 bool isTensor; 738 ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 739 //TODO check that all are the same 740 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 741 if (isTensor) 742 { 743 //TODO check that all are the same 744 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 745 } else { 746 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 747 } 748 } 749 } 750 } 751 data->dim = dim; 752 data->Q1d = Q1d; 753 754 for (CeedInt i = 0; i < numoutputfields; i++) { 755 code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 756 } 757 code << "const CeedInt Dim = "<<dim<<";\n"; 758 code << "const CeedInt Q1d = "<<Q1d<<";\n"; 759 // code << "const CeedInt Q = "<<Q<<";\n"; 760 code << "extern __shared__ CeedScalar slice[];\n"; 761 code << "BackendData data;\n"; 762 code << "data.tidx = threadIdx.x;\n"; 763 code << "data.tidy = threadIdx.y;\n"; 764 code << "data.tidz = threadIdx.z;\n"; 765 code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 766 code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 767 for (CeedInt i = 0; i < numinputfields; i++) { 768 code << "// Input field "<<i<<"\n"; 769 // Get elemsize, emode, ncomp 770 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 771 CeedChk(ierr); 772 ierr = 
CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
  CeedChk(ierr);
  ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
  CeedChk(ierr);
  ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
  CeedChk(ierr);
  // Basis action
  // Per-input-field setup: emit compile-time constants (P, ncomp, ndofs) into
  // the generated kernel source, and stash device basis-matrix pointers in
  // data->B/G so the emitted loadMatrix calls can stage them in shared memory.
  switch (emode) {
  case CEED_EVAL_NONE:
    // NOTE(review): the emitted nquads_in constant is taken from
    // CeedElemRestrictionGetNumDoF — presumably quadrature data is stored as
    // DoFs when no basis action is applied; confirm against readQuads.
    ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
    code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
    code << " const CeedInt nquads_in_"<<i<<" = "<<ndof<<";\n";
    break;
  case CEED_EVAL_INTERP:
    ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
    ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
    code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
    code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
    code << " const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n";
    // Stash the device interp matrix; the generated kernel copies it from
    // B.in[i] into the shared array declared below.
    ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
    data->B.in[i] = basis_data->d_interp1d;
    code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
    code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
    break;
  case CEED_EVAL_GRAD:
    // Gradient needs both the interp (B) and grad (G) 1D matrices.
    ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
    code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
    code << " const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n";
    ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
    code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
    ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
    data->B.in[i] = basis_data->d_interp1d;
    data->G.in[i] = basis_data->d_grad1d;
    code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
    code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
    code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
    code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
    break;
  case CEED_EVAL_WEIGHT:
    break; // No action
  case CEED_EVAL_DIV:
    break; // TODO: Not implemented
  case CEED_EVAL_CURL:
    break; // TODO: Not implemented
  }
  }
  // Per-output-field setup: same pattern as the inputs above — emit constants
  // and stash B/G device pointers for the transpose (quads-to-dofs) path.
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "// Output field "<<i<<"\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
    CeedChk(ierr);
    // Basis action
    switch (emode) {
    case CEED_EVAL_NONE:
      code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
      ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
      // NOTE(review): lmode is fetched here but not used in this case —
      // the apply loop below re-fetches it; this query looks redundant.
      ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
      code << " const CeedInt nquads_out_"<<i<<" = "<<ndof<<";\n";
      break; // No action
    case CEED_EVAL_INTERP:
      code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
      ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
      code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      data->B.out[i] = basis_data->d_interp1d;
      code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
      code << " const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n";
      break;
    case CEED_EVAL_GRAD:
      code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
      ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
      code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      data->B.out[i] = basis_data->d_interp1d;
      data->G.out[i] = basis_data->d_grad1d;
      code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
      ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
      code << " const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n";
      break;
    case CEED_EVAL_WEIGHT: {
      // Quadrature weights are input-only; reject them as an output mode.
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
      return CeedError(ceed, 1,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }
  // Open the generated element loop: one z-slab of the thread block per
  // element, grid-strided over nelem.
  code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
  // Input basis apply if needed
  // Emits, per input field: gather (readDofs/readQuads) into registers, then
  // the basis action (interp/grad/weight) producing the r_t* quad values that
  // feed the QFunction call below.
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << "// Input field "<<i<<"\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
    CeedChk(ierr);
    // Basis action
    switch (emode) {
    case CEED_EVAL_NONE:
      // No basis action: read quadrature data straight into registers.
      code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
      ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
      code << " readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n";
      break;
    case CEED_EVAL_INTERP:
      code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
      ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
      // Stash the restriction's device index array for the gather.
      ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
      data->indices.in[i] = restr_data->d_ind;
      code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
      code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
      code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
      ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
      ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
      data->indices.in[i] = restr_data->d_ind;
      code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
      // Gradient output holds Dim components per quad point.
      code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
      code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
      break;
    case CEED_EVAL_WEIGHT:
      // Weights come from the basis' device qweight array, not from d_u.
      code << " CeedScalar r_t"<<i<<"[Q1d];\n";
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      data->W = basis_data->d_qweight1d;
      code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }
  // Q function
  // Declare per-output register arrays (r_tt*) sized by eval mode, then emit
  // the call: qf(ctx, Q, r_t0, ..., r_tt0, ...).
  code << "// QFunction\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChk(ierr);
    if (emode==CEED_EVAL_GRAD)
    {
      code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
    }
    if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
    {
      code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
    }
  }
  //TODO write qfunction load for this backend
  string qFunctionName(qf_data->qFunctionName);
  // NOTE(review): Q argument is Q1d only for dim==3, else 1 — presumably
  // matching how the emitted basis kernels batch quad points; confirm.
  code << " "<<qFunctionName<<"(ctx, "<<(dim==3?"Q1d":"1")<<", ";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << "r_t"<<i<<", ";
  }
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "r_tt"<<i;
    if (i<numoutputfields-1)
    {
      code << ", ";
    }
  }
  code << ");\n";

  // Output basis apply if needed
  // Emits, per output field: transpose basis action (quads back to dofs),
  // then scatter (writeDofs/writeQuads) into the output vector d_v*.
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "// Output field "<<i<<"\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
    CeedChk(ierr);
    // Basis action
    switch (emode) {
    case CEED_EVAL_NONE:
      ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
      code << " writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n";
      break; // No action
    case CEED_EVAL_INTERP:
      code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
      ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
      data->indices.out[i] = restr_data->d_ind;
      code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
      ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
      ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
      data->indices.out[i] = restr_data->d_ind;
      code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
      break;
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
      return CeedError(ceed, 1,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  // Close the generated element loop and kernel body.
  code << " }\n";
  code << "}\n\n";

  // std::cout << code.str();

  // JIT-compile the assembled source and fetch the "oper" kernel handle.
  ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
  CeedChk(ierr);

  ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);

  return 0;
}