1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#include <ceed-backend.h>
#include "ceed-cuda-gen.h"
#include <iostream>
#include <sstream>
#include "../cuda/ceed-cuda.h"
#include "../cuda-reg/ceed-cuda-reg.h"
#include "../cuda-shared/ceed-cuda-shared.h"

// Software emulation of double-precision atomicAdd built on atomicCAS.
// This source string is prepended to the generated kernel only for devices
// lacking the native instruction (see the compute-capability check in
// CeedCudaGenOperatorBuild below).
static const char *atomicAdd = QUOTE(
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);

// Device-side helpers compiled at runtime into the generated operator kernel.
// Each element is processed by a thread block slab: tidx/tidy address the
// tensor-product quadrature/node grid, tidz selects the element within the
// block, and `slice` points at this element's shared-memory scratch area.
static const char *deviceFunctions = QUOTE(

typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

// Cooperatively copy a P x Q basis matrix from global to shared memory.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (int i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
}

//****
// 1D

// Gather element dofs from global memory; components strided by nnodes.
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Gather element dofs; components interleaved (node-major layout).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read quadrature-point values; components strided by nquads.
template <int NCOMP, int Q1d>
inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Read quadrature-point values; components interleaved.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-accumulate element dofs; atomicAdd handles shared nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Scatter-accumulate element dofs; components interleaved.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Write quadrature-point values; components strided by nquads.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Write quadrature-point values; components interleaved.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}

// Tensor contraction in x: V = B * U (P1d -> Q1d), staged through shared slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i]; // contract x direction
  }
  __syncthreads();
}

// Transposed contraction in x: V = B^T * U (Q1d -> P1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < Q1d; ++i) {
    *V += B[data.tidx + i*P1d] * data.slice[i]; // contract x direction
  }
  __syncthreads();
}

// Interpolate each component from nodes to quadrature points.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// Transpose-apply interpolation (quadrature points back to nodes).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D gradient: contracts with c_G only (c_B kept for signature parity with 2D/3D).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

// Transpose of grad1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

//****
// 2D

// Gather element dofs (P1d x P1d nodes); components strided by nnodes.
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Gather element dofs; components interleaved.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read quadrature-point values on the Q1d x Q1d grid; components strided.
template <int NCOMP, int Q1d>
inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Read quadrature-point values; components interleaved.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-accumulate element dofs; atomicAdd handles shared nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Scatter-accumulate element dofs; components interleaved.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Write quadrature-point values; components strided by nquads.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Write quadrature-point values; components interleaved.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}

// Contract in x on the 2D slab (P1d -> Q1d), staged through shared slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
  }
  __syncthreads();
}

// Contract in y on the 2D slab (P1d -> Q1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // contract y direction
  }
  __syncthreads();
}

// Transposed contraction in y (Q1d -> P1d); only tidy < P1d threads produce output.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy < P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // contract y direction
    }
  }
  __syncthreads();
}

// Transposed contraction in x (Q1d -> P1d); only tidx < P1d threads produce output.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
    }
  }
  __syncthreads();
}

// Like ContractXTranspose2d but accumulates into V instead of overwriting it.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx < P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
    }
  }
  __syncthreads();
}

// 2D interpolation: x contraction then y contraction per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}
// 2D transpose interpolation: y^T then x^T contraction per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D gradient: d/dx into r_V[0..NCOMP), d/dy into r_V[NCOMP..2*NCOMP).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}

// Transpose of grad2d; the second directional term accumulates into r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

//****
// 3D

// Gather element dofs (P1d^3 nodes); each thread holds a z-column per component.
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
      }
    }
  }
}

// Gather element dofs; components interleaved.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
      }
    }
  }
}

// Read quadrature-point values over the Q1d^3 grid; components strided.
template <int NCOMP, int Q1d>
inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind + nquads * comp];
    }
  }
}

// Read quadrature-point values; components interleaved.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp];
    }
  }
}

// Scatter-accumulate element dofs; atomicAdd handles shared nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Scatter-accumulate element dofs; components interleaved.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Write quadrature-point values; components strided by nquads.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + nquads * comp] = r_v[z+comp*Q1d];
    }
  }
}

// Write quadrature-point values; components interleaved.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d];
    }
  }
}

// Contract in x, one z-plane at a time, staged through shared slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
    }
    __syncthreads();
  }
}

// Contract in y, one z-plane at a time.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // contract y direction
    }
    __syncthreads();
  }
}

// Contract in z: purely register-local (the z-column lives in one thread).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i]; // contract z direction
    }
  }
}

// Transposed contraction in z; entries past P1d are zeroed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k < P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i]; // contract z direction
      }
    }
  }
}

// Transposed contraction in y, one z-plane at a time.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy < P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // contract y direction
      }
    }
    __syncthreads();
  }
}

// Transposed contraction in x, one z-plane at a time.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
      }
    }
    __syncthreads();
  }
}

// Like ContractTransposeX3d but accumulates into V instead of overwriting it.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
      }
    }
    __syncthreads();
  }
}

// 3D interpolation: x, y, then z contraction per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}

// 3D transpose interpolation: z^T, y^T, then x^T contraction per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 3D gradient: d/dx, d/dy, d/dz stored in consecutive NCOMP*Q1d-sized chunks.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}

// Transpose of grad3d; second and third directional terms accumulate into r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// Quadrature weight at this thread's 1D point.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx];
}

// Tensor-product quadrature weight at this thread's 2D point.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
}

// Tensor-product quadrature weights for this thread's z-column of 3D points.
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (int z = 0; z < Q1d; ++z) {
    w[z] = pw*qweight1d[z];
  }
}

);

// Build (code-generate) the fused kernel source for this operator.
// Returns a CEED error code; no-op if the operator is already set up.
extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {

  using std::ostringstream;
  using std::string;
  int ierr;
  bool setupdone;
  ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
  if (setupdone) return 0;
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
  CeedOperator_Cuda_gen *data;
  ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
  CeedQFunction qf;
  CeedQFunction_Cuda_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
  ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
  CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
  ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
  CeedChk(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
  CeedChk(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
  CeedChk(ierr);
  CeedEvalMode emode;
  CeedTransposeMode lmode;
  CeedBasis basis;
  CeedBasis_Cuda_shared *basis_data;
  CeedElemRestriction Erestrict;
  CeedElemRestriction_Cuda_reg *restr_data;

  ostringstream code;
  string devFunctions(deviceFunctions);

  // Add atomicAdd function for old NVidia architectures
  struct cudaDeviceProp prop;
  Ceed
delegate; 722 CeedGetDelegate(ceed, &delegate); 723 Ceed_Cuda *ceed_data; 724 ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr); 725 ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 726 if(prop.major<6){ 727 code << atomicAdd; 728 } 729 730 code << devFunctions; 731 732 string qFunction(qf_data->qFunctionSource); 733 734 code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 735 code << qFunction; 736 737 // Setup 738 code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 739 // Input Evecs and Restriction 740 for (CeedInt i = 0; i < numinputfields; i++) { 741 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 742 CeedChk(ierr); 743 if (emode == CEED_EVAL_WEIGHT) { // Skip 744 } else { 745 code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 746 if (emode != CEED_EVAL_NONE) 747 { 748 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 749 bool isTensor; 750 ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 751 //TODO check that all are the same 752 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 753 if (isTensor) 754 { 755 //TODO check that all are the same 756 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 757 } else { 758 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 759 } 760 } 761 } 762 } 763 data->dim = dim; 764 data->Q1d = Q1d; 765 766 for (CeedInt i = 0; i < numoutputfields; i++) { 767 code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 768 } 769 code << "const CeedInt Dim = "<<dim<<";\n"; 770 code << "const CeedInt Q1d = "<<Q1d<<";\n"; 771 // code << "const CeedInt Q = "<<Q<<";\n"; 772 code << "extern __shared__ CeedScalar slice[];\n"; 773 code << "BackendData data;\n"; 774 code << "data.tidx = threadIdx.x;\n"; 775 code << "data.tidy = threadIdx.y;\n"; 776 code << 
"data.tidz = threadIdx.z;\n"; 777 code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 778 code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 779 for (CeedInt i = 0; i < numinputfields; i++) { 780 code << "// Input field "<<i<<"\n"; 781 // Get elemsize, emode, ncomp 782 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 783 CeedChk(ierr); 784 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 785 CeedChk(ierr); 786 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 787 CeedChk(ierr); 788 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 789 CeedChk(ierr); 790 // Basis action 791 switch (emode) { 792 case CEED_EVAL_NONE: 793 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 794 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 795 code << " const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n"; 796 break; 797 case CEED_EVAL_INTERP: 798 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 799 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 800 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 801 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 802 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 803 code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 804 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 805 data->B.in[i] = basis_data->d_interp1d; 806 code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 807 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 808 break; 809 case CEED_EVAL_GRAD: 810 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 811 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 812 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 813 code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 
814 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 815 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 816 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 817 data->B.in[i] = basis_data->d_interp1d; 818 data->G.in[i] = basis_data->d_grad1d; 819 code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 820 code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 821 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 822 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 823 break; 824 case CEED_EVAL_WEIGHT: 825 break; // No action 826 case CEED_EVAL_DIV: 827 break; // TODO: Not implemented 828 case CEED_EVAL_CURL: 829 break; // TODO: Not implemented 830 } 831 } 832 for (CeedInt i = 0; i < numoutputfields; i++) { 833 code << "// Output field "<<i<<"\n"; 834 // Get elemsize, emode, ncomp 835 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 836 CeedChk(ierr); 837 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 838 CeedChk(ierr); 839 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 840 CeedChk(ierr); 841 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 842 CeedChk(ierr); 843 // Basis action 844 switch (emode) { 845 case CEED_EVAL_NONE: 846 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 847 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 848 ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 849 code << " const CeedInt nquads_out_"<<i<<" = "<<nnodes<<";\n"; 850 break; // No action 851 case CEED_EVAL_INTERP: 852 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 853 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 854 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 855 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 856 ierr = CeedBasisGetData(basis, (void **)&basis_data); 
CeedChk(ierr); 857 data->B.out[i] = basis_data->d_interp1d; 858 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 859 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 860 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 861 code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 862 break; 863 case CEED_EVAL_GRAD: 864 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 865 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 866 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 867 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 868 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 869 data->B.out[i] = basis_data->d_interp1d; 870 data->G.out[i] = basis_data->d_grad1d; 871 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 872 code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 873 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 874 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 875 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 876 code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 877 break; 878 case CEED_EVAL_WEIGHT: { 879 Ceed ceed; 880 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 881 return CeedError(ceed, 1, 882 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 883 break; // Should not occur 884 } 885 case CEED_EVAL_DIV: 886 break; // TODO: Not implemented 887 case CEED_EVAL_CURL: 888 break; // TODO: Not implemented 889 } 890 } 891 code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 892 // Input basis apply if needed 893 for (CeedInt i = 0; i < numinputfields; i++) { 894 code << "// Input field "<<i<<"\n"; 895 // Get elemsize, emode, ncomp 896 ierr = 
CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 897 CeedChk(ierr); 898 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 899 CeedChk(ierr); 900 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 901 CeedChk(ierr); 902 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 903 CeedChk(ierr); 904 // Basis action 905 switch (emode) { 906 case CEED_EVAL_NONE: 907 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 908 ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 909 code << " readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n"; 910 break; 911 case CEED_EVAL_INTERP: 912 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 913 ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 914 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 915 data->indices.in[i] = restr_data->d_ind; 916 code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 917 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 918 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 919 break; 920 case CEED_EVAL_GRAD: 921 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 922 ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 923 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 924 data->indices.in[i] = restr_data->d_ind; 925 code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 926 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 927 code << " 
grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 928 break; 929 case CEED_EVAL_WEIGHT: 930 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 931 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 932 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 933 data->W = basis_data->d_qweight1d; 934 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 935 break; // No action 936 case CEED_EVAL_DIV: 937 break; // TODO: Not implemented 938 case CEED_EVAL_CURL: 939 break; // TODO: Not implemented 940 } 941 } 942 // Q function 943 code << "// QFunction\n"; 944 for (CeedInt i = 0; i < numoutputfields; i++) { 945 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 946 CeedChk(ierr); 947 if (emode==CEED_EVAL_GRAD) 948 { 949 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 950 } 951 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 952 { 953 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 954 } 955 } 956 code << " CeedScalar* in["<<numinputfields<<"];\n"; 957 for (CeedInt i = 0; i < numinputfields; i++) { 958 code << " in["<<i<<"] = r_t"<<i<<";\n"; 959 } 960 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 961 for (CeedInt i = 0; i < numoutputfields; i++) { 962 code << " out["<<i<<"] = r_tt"<<i<<";\n"; 963 } 964 string qFunctionName(qf_data->qFunctionName); 965 code << " "<<qFunctionName<<"(ctx, "<<(dim==3?"Q1d":"1")<<", "; 966 code << "in, out"; 967 code << ");\n"; 968 969 // Output basis apply if needed 970 for (CeedInt i = 0; i < numoutputfields; i++) { 971 code << "// Output field "<<i<<"\n"; 972 // Get elemsize, emode, ncomp 973 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 974 CeedChk(ierr); 975 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 976 CeedChk(ierr); 977 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 978 CeedChk(ierr); 979 ierr = 
CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 980 CeedChk(ierr); 981 // Basis action 982 switch (emode) { 983 case CEED_EVAL_NONE: 984 ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 985 code << " writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n"; 986 break; // No action 987 case CEED_EVAL_INTERP: 988 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 989 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 990 ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 991 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 992 data->indices.out[i] = restr_data->d_ind; 993 code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 994 break; 995 case CEED_EVAL_GRAD: 996 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 997 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 998 ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 999 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1000 data->indices.out[i] = restr_data->d_ind; 1001 code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1002 break; 1003 case CEED_EVAL_WEIGHT: { 1004 Ceed ceed; 1005 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1006 return CeedError(ceed, 1, 1007 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1008 break; // Should not occur 1009 } 1010 case CEED_EVAL_DIV: 1011 break; // TODO: 
Not implemented 1012 case CEED_EVAL_CURL: 1013 break; // TODO: Not implemented 1014 } 1015 } 1016 1017 code << " }\n"; 1018 code << "}\n\n"; 1019 1020 // std::cout << code.str(); 1021 1022 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr); 1023 ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1024 CeedChk(ierr); 1025 1026 ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1027 1028 return 0; 1029 } 1030