// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
// All Rights reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.
#include <ceed-backend.h>
#include "ceed-cuda-gen.h"
#include <iostream>
#include <sstream>
#include "../cuda/ceed-cuda.h"
#include "../cuda-reg/ceed-cuda-reg.h"
#include "../cuda-shared/ceed-cuda-shared.h"

// Source for a software double-precision atomicAdd built on atomicCAS.
// It is prepended to the generated kernel only for devices with compute
// capability < 6.0 (see the prop.major < 6 check later in this file),
// which have no native double atomicAdd.
static const char *atomicAdd = QUOTE(
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);

// Device-side helper functions compiled into every generated operator kernel.
// NOTE(review): these helpers index shared memory as if blockDim.x == Q1d
// (and blockDim.y == Q1d for dim > 1) — confirm against the kernel launch
// configuration before changing any tile indexing.
static const char *deviceFunctions = QUOTE(

typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread context: thread coordinates within the block plus this
// thread-slab's piece of the dynamic shared memory buffer.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

// Cooperatively copy the P*Q basis matrix d_B (global) into B (shared),
// striding over all threads of the block.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (int i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
  // Fix: was `__syncthreads;` (the bare function name — a discarded-value
  // expression, i.e. a no-op), which let threads read B before all of it
  // was written. The barrier must actually be called.
  __syncthreads();
}

//****
// 1D

// Gather the NCOMP nodal values of one 1D element into registers r_u.
// A NULL indices pointer means the identity (contiguous) restriction.
// L-vector layout: node-major, component stride nnodes.
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt dof = data.tidx;
    const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// As readDofs1d, but for the interleaved layout (components contiguous per node).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt dof = data.tidx;
    const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read the NCOMP values at this thread's 1D quadrature point
// (quad-major layout, component stride nquads).
template <int NCOMP, int Q1d>
inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt dof = data.tidx;
  const CeedInt ind = dof + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// As readQuads1d, but for the interleaved layout.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt dof = data.tidx;
  const CeedInt ind = dof + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add registers r_v back into the L-vector. atomicAdd is required
// because neighboring elements may map to the same node index.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt dof = data.tidx;
    const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// As writeDofs1d, but for the interleaved layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt dof = data.tidx;
    const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Write the NCOMP values at this thread's 1D quadrature point. Each quad
// point is owned by exactly one thread, so a plain store suffices.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt dof = data.tidx;
  const CeedInt ind = dof + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// As writeQuads1d, but for the interleaved layout.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt dof = data.tidx;
  const CeedInt ind = dof + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}

// 1D tensor contraction V = B * U: each thread publishes its value to the
// shared slice, then accumulates row tidx of B against all P1d values.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i]; // contract x direction
  }
  __syncthreads();
}

// 1D transposed contraction V = B^T * U (Q1d values -> P1d values).
// NOTE(review): unlike the 2D/3D transpose variants there is no
// `tidx < P1d` guard here, so for tidx >= P1d the index
// `B[tidx + i*P1d]` can run past the P1d*Q1d matrix — confirm the launch
// configuration makes this unreachable or benign.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < Q1d; ++i) {
    *V += B[data.tidx + i*P1d] * data.slice[i]; // contract x direction
  }
  __syncthreads();
}

// Interpolate nodes -> quadrature points, one component at a time.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// Transpose interpolation: quadrature points -> nodes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D gradient at quadrature points (c_B is unused in 1D; kept so all
// grad*d helpers share one signature).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

// Transpose of grad1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

//****
// 2D

// Gather the NCOMP nodal values at this thread's (tidx, tidy) node of a
// 2D element into registers; NULL indices means identity restriction.
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// As readDofs2d, but for the interleaved layout.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read the NCOMP values at this thread's 2D quadrature point.
template <int NCOMP, int Q1d>
inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt dof = data.tidx + data.tidy*Q1d;
  const CeedInt ind = dof + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// As readQuads2d, but for the interleaved layout.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt dof = data.tidx + data.tidy*Q1d;
  const CeedInt ind = dof + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add this thread's 2D nodal values back to the L-vector (atomic:
// shared nodes between elements).
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// As writeDofs2d, but for the interleaved layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Write this thread's 2D quadrature-point values (unique owner, plain store).
template <int NCOMP, int Q1d>
inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt dof = data.tidx + data.tidy*Q1d;
  const CeedInt ind = dof + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// As writeQuads2d, but for the interleaved layout.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt dof = data.tidx + data.tidy*Q1d;
  const CeedInt ind = dof + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}

// 2D contraction along x: the block shares a Q1d x Q1d tile in `slice`;
// each thread reduces row tidx of B against its tidy row of the tile.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
  }
  __syncthreads();
}

// 2D contraction along y.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // contract y direction
  }
  __syncthreads();
}

// 2D transposed contraction along y (Q1d -> P1d); threads with
// tidy >= P1d only help fill the tile and leave V at zero.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy < P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // contract y direction
    }
  }
  __syncthreads();
}

// 2D transposed contraction along x (Q1d -> P1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
    }
  }
  __syncthreads();
}

// As ContractXTranspose2d but accumulates into V instead of overwriting
// (used to sum the gradient components in gradTranspose2d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx < P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
    }
  }
  __syncthreads();
}

// 2D interpolation nodes -> quadrature points: contract x then y per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D transpose interpolation: apply B^T in y then x, per component,
// through a one-value register temporary.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D gradient at quadrature points: d/dx goes to r_V[comp + 0*NCOMP],
// d/dy to r_V[comp + 1*NCOMP] (G in the differentiated direction, B in the other).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}

// Transpose of grad2d: the second gradient component is accumulated onto
// the first via the Add variant of the x-contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

//****
// 3D

// Gather one vertical column of nodal values (all z layers at this
// thread's (tidx, tidy)) into registers; NULL indices = identity restriction.
// Register layout: r_u[z + comp*P1d].
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
      }
    }
  }
}

// As readDofs3d, but for the interleaved layout.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
      }
    }
  }
}

// Read this thread's column of quadrature-point values; layout r_u[z + comp*Q1d].
template <int NCOMP, int Q1d>
inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind + nquads * comp];
    }
  }
}

// As readQuads3d, but for the interleaved layout.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp];
    }
  }
}

// Scatter-add this thread's column of nodal values back to the L-vector
// (atomic: elements share nodes).
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// As writeDofs3d, but for the interleaved layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Write this thread's column of quadrature-point values (unique owner, plain store).
template <int NCOMP, int Q1d>
inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + nquads * comp] = r_v[z+comp*Q1d];
    }
  }
}

// As writeQuads3d, but for the interleaved layout.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d];
    }
  }
}

// 3D contraction along x, one z layer at a time through the shared tile.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
    }
    __syncthreads();
  }
}

// 3D contraction along y, one z layer at a time.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // contract y direction
    }
    __syncthreads();
  }
}

// 3D contraction along z: purely register-resident (the z column lives in
// this thread's registers), so no shared memory or barrier is needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i]; // contract z direction
    }
  }
}

// 3D transposed contraction along z (Q1d -> P1d); layers k >= P1d are zeroed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k < P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i]; // contract z direction
      }
    }
  }
}

// 3D transposed contraction along y; threads with tidy >= P1d only help
// fill the shared tile.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy < P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // contract y direction
      }
    }
    __syncthreads();
  }
}

// 3D transposed contraction along x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
      }
    }
    __syncthreads();
  }
}

// As ContractTransposeX3d but accumulates into V (sums gradient components
// in gradTranspose3d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // contract x direction
      }
    }
    __syncthreads();
  }
}

// 3D interpolation nodes -> quadrature points: x, then y, then z.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}

// 3D transpose interpolation: z, then y, then x with transposed contractions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 3D gradient: one sweep per direction, applying G in the differentiated
// direction and B in the other two; outputs stacked at strides of NCOMP*Q1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}

// Transpose of grad3d: second and third components accumulate into r_V
// through the Add variant of the final x-contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (int comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 1D quadrature weight at this thread's point.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx];
}

// 2D tensor-product quadrature weight at this thread's point.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar
*qweight1d, CeedScalar *w) { 669 *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 670 } 671 672 template <int Q1d> 673 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 674 const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 675 for (int z = 0; z < Q1d; ++z) 676 { 677 w[z] = pw*qweight1d[z]; 678 } 679 } 680 681 ); 682 683 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 684 685 using std::ostringstream; 686 using std::string; 687 int ierr; 688 bool setupdone; 689 ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr); 690 if (setupdone) return 0; 691 Ceed ceed; 692 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 693 CeedOperator_Cuda_gen *data; 694 ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); 695 CeedQFunction qf; 696 CeedQFunction_Cuda_gen *qf_data; 697 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 698 ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 699 CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes; 700 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 701 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 702 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 703 CeedChk(ierr); 704 CeedOperatorField *opinputfields, *opoutputfields; 705 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 706 CeedChk(ierr); 707 CeedQFunctionField *qfinputfields, *qfoutputfields; 708 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 709 CeedChk(ierr); 710 CeedEvalMode emode; 711 CeedTransposeMode lmode; 712 CeedBasis basis; 713 CeedBasis_Cuda_shared *basis_data; 714 CeedElemRestriction Erestrict; 715 CeedElemRestriction_Cuda_reg *restr_data; 716 717 ostringstream code; 718 string devFunctions(deviceFunctions); 719 720 // Add atomicAdd function for old NVidia architectures 721 struct cudaDeviceProp prop; 722 Ceed 
delegate; 723 CeedGetDelegate(ceed, &delegate); 724 Ceed_Cuda *ceed_data; 725 ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr); 726 ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 727 if(prop.major<6){ 728 code << atomicAdd; 729 } 730 731 code << devFunctions; 732 733 string qFunction(qf_data->qFunctionSource); 734 code << qFunction; 735 736 // Setup 737 code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 738 // Input Evecs and Restriction 739 for (CeedInt i = 0; i < numinputfields; i++) { 740 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 741 CeedChk(ierr); 742 if (emode == CEED_EVAL_WEIGHT) { // Skip 743 } else { 744 code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 745 if (emode != CEED_EVAL_NONE) 746 { 747 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 748 bool isTensor; 749 ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 750 //TODO check that all are the same 751 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 752 if (isTensor) 753 { 754 //TODO check that all are the same 755 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 756 } else { 757 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 758 } 759 } 760 } 761 } 762 data->dim = dim; 763 data->Q1d = Q1d; 764 765 for (CeedInt i = 0; i < numoutputfields; i++) { 766 code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 767 } 768 code << "const CeedInt Dim = "<<dim<<";\n"; 769 code << "const CeedInt Q1d = "<<Q1d<<";\n"; 770 // code << "const CeedInt Q = "<<Q<<";\n"; 771 code << "extern __shared__ CeedScalar slice[];\n"; 772 code << "BackendData data;\n"; 773 code << "data.tidx = threadIdx.x;\n"; 774 code << "data.tidy = threadIdx.y;\n"; 775 code << "data.tidz = threadIdx.z;\n"; 776 code << "data.tid = threadIdx.x + 
threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 777 code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 778 for (CeedInt i = 0; i < numinputfields; i++) { 779 code << "// Input field "<<i<<"\n"; 780 // Get elemsize, emode, ncomp 781 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 782 CeedChk(ierr); 783 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 784 CeedChk(ierr); 785 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 786 CeedChk(ierr); 787 ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp); 788 CeedChk(ierr); 789 // Basis action 790 switch (emode) { 791 case CEED_EVAL_NONE: 792 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 793 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 794 code << " const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n"; 795 break; 796 case CEED_EVAL_INTERP: 797 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 798 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 799 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 800 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 801 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 802 code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 803 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 804 data->B.in[i] = basis_data->d_interp1d; 805 code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 806 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 807 break; 808 case CEED_EVAL_GRAD: 809 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 810 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 811 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 812 code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 813 ierr = CeedBasisGetNumNodes1D(basis, &P1d); 
CeedChk(ierr); 814 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 815 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 816 data->B.in[i] = basis_data->d_interp1d; 817 data->G.in[i] = basis_data->d_grad1d; 818 code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 819 code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 820 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 821 code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 822 break; 823 case CEED_EVAL_WEIGHT: 824 break; // No action 825 case CEED_EVAL_DIV: 826 break; // TODO: Not implemented 827 case CEED_EVAL_CURL: 828 break; // TODO: Not implemented 829 } 830 } 831 for (CeedInt i = 0; i < numoutputfields; i++) { 832 code << "// Output field "<<i<<"\n"; 833 // Get elemsize, emode, ncomp 834 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 835 CeedChk(ierr); 836 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 837 CeedChk(ierr); 838 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 839 CeedChk(ierr); 840 ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp); 841 CeedChk(ierr); 842 // Basis action 843 switch (emode) { 844 case CEED_EVAL_NONE: 845 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 846 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 847 ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 848 code << " const CeedInt nquads_out_"<<i<<" = "<<nnodes<<";\n"; 849 break; // No action 850 case CEED_EVAL_INTERP: 851 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 852 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 853 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 854 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 855 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 856 data->B.out[i] = 
basis_data->d_interp1d; 857 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 858 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 859 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 860 code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 861 break; 862 case CEED_EVAL_GRAD: 863 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 864 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 865 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 866 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 867 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 868 data->B.out[i] = basis_data->d_interp1d; 869 data->G.out[i] = basis_data->d_grad1d; 870 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 871 code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 872 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 873 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 874 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 875 code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 876 break; 877 case CEED_EVAL_WEIGHT: { 878 Ceed ceed; 879 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 880 return CeedError(ceed, 1, 881 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 882 break; // Should not occur 883 } 884 case CEED_EVAL_DIV: 885 break; // TODO: Not implemented 886 case CEED_EVAL_CURL: 887 break; // TODO: Not implemented 888 } 889 } 890 code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 891 // Input basis apply if needed 892 for (CeedInt i = 0; i < numinputfields; i++) { 893 code << "// Input field "<<i<<"\n"; 894 // Get elemsize, emode, ncomp 895 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 896 CeedChk(ierr); 
897 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 898 CeedChk(ierr); 899 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 900 CeedChk(ierr); 901 ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp); 902 CeedChk(ierr); 903 // Basis action 904 switch (emode) { 905 case CEED_EVAL_NONE: 906 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 907 ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 908 code << " readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n"; 909 break; 910 case CEED_EVAL_INTERP: 911 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 912 ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 913 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 914 data->indices.in[i] = restr_data->d_ind; 915 code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 916 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 917 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 918 break; 919 case CEED_EVAL_GRAD: 920 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 921 ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 922 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 923 data->indices.in[i] = restr_data->d_ind; 924 code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 925 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 926 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", 
s_G_in_"<<i<<", r_t"<<i<<");\n"; 927 break; 928 case CEED_EVAL_WEIGHT: 929 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 930 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 931 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 932 data->W = basis_data->d_qweight1d; 933 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 934 break; // No action 935 case CEED_EVAL_DIV: 936 break; // TODO: Not implemented 937 case CEED_EVAL_CURL: 938 break; // TODO: Not implemented 939 } 940 } 941 // Q function 942 code << "// QFunction\n"; 943 for (CeedInt i = 0; i < numoutputfields; i++) { 944 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 945 CeedChk(ierr); 946 if (emode==CEED_EVAL_GRAD) 947 { 948 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 949 } 950 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 951 { 952 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 953 } 954 } 955 //TODO write qfunction load for this backend 956 string qFunctionName(qf_data->qFunctionName); 957 code << " "<<qFunctionName<<"(ctx, "<<(dim==3?"Q1d":"1")<<", "; 958 for (CeedInt i = 0; i < numinputfields; i++) { 959 code << "r_t"<<i<<", "; 960 } 961 for (CeedInt i = 0; i < numoutputfields; i++) { 962 code << "r_tt"<<i; 963 if (i<numoutputfields-1) 964 { 965 code << ", "; 966 } 967 } 968 code << ");\n"; 969 970 // Output basis apply if needed 971 for (CeedInt i = 0; i < numoutputfields; i++) { 972 code << "// Output field "<<i<<"\n"; 973 // Get elemsize, emode, ncomp 974 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 975 CeedChk(ierr); 976 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 977 CeedChk(ierr); 978 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 979 CeedChk(ierr); 980 ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp); 981 CeedChk(ierr); 982 // Basis action 983 switch (emode) { 984 case CEED_EVAL_NONE: 985 ierr = 
CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 986 code << " writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n"; 987 break; // No action 988 case CEED_EVAL_INTERP: 989 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 990 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 991 ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 992 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 993 data->indices.out[i] = restr_data->d_ind; 994 code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 995 break; 996 case CEED_EVAL_GRAD: 997 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 998 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 999 ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 1000 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1001 data->indices.out[i] = restr_data->d_ind; 1002 code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1003 break; 1004 case CEED_EVAL_WEIGHT: { 1005 Ceed ceed; 1006 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1007 return CeedError(ceed, 1, 1008 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1009 break; // Should not occur 1010 } 1011 case CEED_EVAL_DIV: 1012 break; // TODO: Not implemented 1013 case CEED_EVAL_CURL: 1014 break; // TODO: Not implemented 1015 } 1016 } 1017 1018 code << " }\n"; 1019 code << "}\n\n"; 1020 1021 
// std::cout << code.str(); 1022 1023 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr); 1024 ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1025 CeedChk(ierr); 1026 1027 ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1028 1029 return 0; 1030 } 1031