1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#include <ceed-backend.h>
#include "ceed-cuda-gen.h"
#include <iostream>
#include <sstream>
#include "../cuda/ceed-cuda.h"
#include "../cuda-reg/ceed-cuda-reg.h"
#include "../cuda-shared/ceed-cuda-shared.h"

// Software emulation of double-precision atomicAdd built on atomicCAS.
// Prepended to the generated kernel source only for devices of compute
// capability < 6.0, which lack a native double atomicAdd (see the
// prop.major < 6 check below).
static const char *atomicAdd = QUOTE(
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);

// Device-side helper templates injected, as source text, into every generated
// operator kernel.  NOTE: comments inside QUOTE(...) are removed by the
// preprocessor before stringization, so they never reach the generated source.
static const char *deviceFunctions = QUOTE(

// Bundles of per-field device pointers passed to the generated kernel
// (fixed capacity of 16 input and 16 output fields).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread data: thread indices, flattened thread id, and this thread's
// view into the dynamic shared-memory scratchpad (one slice per tidz --
// set up by the generated kernel preamble).
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

// Cooperatively copy a P x Q basis matrix from global memory (d_B) into
// shared memory (B); all threads of the block participate, strided by the
// total block size.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
}

//****
// 1D

// Gather the element DOFs of all components through the restriction indices
// into per-thread registers r_u (one thread per node, tidx < P1d).
// nnodes is unused here; kept for a uniform read/write signature.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
    }
  }
}

// Gather element DOFs for a strided (identity-indexed) restriction; the
// node/component/element strides are compile-time template parameters.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
    }
  }
}

// Scatter-add register values r_v back to the L-vector through the
// restriction indices; atomicAdd because multiple elements may share a node.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
    }
  }
}

// Scatter-add for a strided restriction; strided layouts are non-overlapping
// across elements, so a plain += suffices (no atomics needed).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
    }
  }
}

// 1D tensor contraction V = B * U (B is P1d x Q1d, column index tidx);
// U is staged through the shared slice so each thread can read all nodes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}

// 1D transposed contraction V = B^T * U (sums over the Q1d quadrature points).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < Q1d; ++i) {
    *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}

// Interpolate nodes -> quadrature points, one contraction per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// Transpose of interp1d: quadrature points -> nodes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D gradient: contract with the derivative matrix c_G (c_B unused in 1D but
// kept for a signature uniform with the 2D/3D variants).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

// Transpose of grad1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

//****
// 2D

// 2D offset gather: one thread per (tidx, tidy) node of the P1d x P1d element.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
    }
  }
}

// 2D strided gather (identity restriction with compile-time strides).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
    }
  }
}

// 2D offset scatter-add (atomic: nodes shared between elements).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
    }
  }
}

// 2D strided scatter-add (non-overlapping, so no atomics).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
    }
  }
}

// 2D contraction in x: stage U in the shared Q1d-strided slice, then each
// thread accumulates its row of B against the slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
  }
  __syncthreads();
}

// 2D contraction in y.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
  }
  __syncthreads();
}

// 2D transposed contraction in y; only threads with tidy < P1d produce
// output, but all threads write the slice and hit both barriers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
  }
  __syncthreads();
}

// 2D transposed contraction in x (overwrites V).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// Same as ContractXTranspose2d but accumulates into V (no zeroing) --
// used to sum the x- and y-derivative contributions in gradTranspose2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// 2D interpolation: x-contraction then y-contraction per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// Transpose of interp2d (contractions applied in reverse order).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D gradient: d/dx = G_x B_y, d/dy = B_x G_y; the two derivative
// components are stored NCOMP apart in r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}

// Transpose of grad2d; the second derivative contribution is accumulated
// via ContractXTransposeAdd2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

//****
// 3D

// 3D offset gather: 2D thread layout, serial loop over the z planes;
// registers hold P1d values per component (r_u[z+comp*P1d]).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
      }
    }
  }
}

// 3D strided gather.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
      }
    }
  }
}

// Gather the quadrature values of a single z-plane q (used with the
// collocated-gradient path); offset-indexed variant.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

// Strided variant of the single-plane quadrature gather.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

// 3D offset scatter-add (atomic: nodes shared between elements).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// 3D strided scatter-add (non-overlapping, so no atomics).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
      }
    }
  }
}

// 3D contraction in x: each thread caches its row of B in registers, then
// the z planes are processed one at a time through the shared slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidx*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
    __syncthreads();
  }
}

// 3D contraction in y.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidy*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
    __syncthreads();
  }
}

// 3D contraction in z: purely register-resident (the z direction lives in
// each thread's registers), so no shared memory or barriers are needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i];//contract z direction
    }
  }
}

// 3D transposed contraction in z; entries k >= P1d are zeroed so the
// full Q1d-length register array stays well defined.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i];//contract z direction
      }
    }
  }
}

// 3D transposed contraction in y (overwrites V).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
  {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// 3D transposed contraction in y, accumulating into V (no zeroing).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// 3D transposed contraction in x (overwrites V).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// 3D transposed contraction in x, accumulating into V (no zeroing) --
// used to sum derivative contributions in gradTranspose3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// 3D interpolation: x, then y, then z contraction per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}

// Transpose of interp3d (z, then y, then x).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 3D gradient: d/dx = G B B, d/dy = B G B, d/dz = B B G; the three
// derivative components are stored NCOMP*Q1d apart in r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}

// Transpose of grad3d: the first component overwrites r_V, the remaining
// two are accumulated via ContractTransposeAddX3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// Collocated 3D gradient at the quadrature points of plane q, using the
// Q1d x Q1d collocated derivative matrix c_G; x/y/z derivatives are stored
// NCOMP apart in r_V.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
                                   const CeedScalar *__restrict__ r_U,
                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads();
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    // Z derivative
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
    }
    __syncthreads();
  }
}

// Transpose of gradCollo3d; the z contribution of plane q is spread across
// all planes of r_V, so it is only complete after all q have been processed.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
                                            const CeedScalar *__restrict__ r_U,
                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    __syncthreads();
    // Y derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    __syncthreads();
    // Z derivative
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
    }
  }
}

// Quadrature weight at this thread's 1D point.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx];
}

// Tensor-product quadrature weight at this thread's 2D point.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
}

// Tensor-product quadrature weights for this thread's full z column.
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt z = 0; z < Q1d; ++z)
  {
    w[z] = pw*qweight1d[z];
  }
}

);

// Build (code-generate) the single fused CUDA kernel for this operator:
// concatenates the device helpers above, the user's QFunction source, and
// per-field restriction/basis glue into one source string for compilation.
// Runs once per operator (early-returns if setup is already done).
extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {

  using std::ostringstream;
  using std::string;
  int ierr;
  bool setupdone;
  ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
  if (setupdone) return 0;
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
  CeedOperator_Cuda_gen *data;
  ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
  CeedQFunction qf;
  CeedQFunction_Cuda_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
  ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
  CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
          numoutputfields, ncomp, dim = 0, lsize;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
  ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
  CeedChk(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
  CeedChk(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
  CeedChk(ierr);
  CeedEvalMode emode;
  CeedBasis basis;
  CeedBasis_Cuda_shared *basis_data;
  CeedElemRestriction Erestrict;
  CeedElemRestriction_Cuda_reg *restr_data;

  ostringstream code;
  string
devFunctions(deviceFunctions); 712 713 // Add atomicAdd function for old NVidia architectures 714 struct cudaDeviceProp prop; 715 Ceed_Cuda *ceed_data; 716 ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr); 717 ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 718 if (prop.major<6){ 719 code << atomicAdd; 720 } 721 722 code << devFunctions; 723 724 string qFunction(qf_data->qFunctionSource); 725 726 code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 727 code << "\n#define CeedPragmaSIMD\n"; 728 729 // Find dim and Q1d 730 bool collograd = false; 731 for (CeedInt i = 0; i < numinputfields; i++) { 732 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 733 if (basis != CEED_BASIS_COLLOCATED) { 734 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 735 736 // Check for collocated gradient 737 if (basis_data->d_collograd1d) 738 collograd = true; 739 740 // Collect dim and Q1d 741 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 742 bool isTensor; 743 ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 744 if (isTensor) { 745 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 746 } else { 747 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 748 } 749 } 750 } 751 // Check output bases for Q1d, dim as well 752 // The only imput basis might be CEED_BASIS_COLLOCATED 753 for (CeedInt i = 0; i < numoutputfields; i++) { 754 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 755 if (basis != CEED_BASIS_COLLOCATED) { 756 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 757 // Collect dim and Q1d 758 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 759 bool isTensor; 760 ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 761 if (isTensor) { 762 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 763 } else { 764 return 
CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 765 } 766 } 767 } 768 data->dim = dim; 769 data->Q1d = Q1d; 770 771 // Define CEED_Q_VLA 772 if (dim != 3 || collograd) { 773 code << "\n#define CEED_Q_VLA 1\n\n"; 774 } else { 775 code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 776 } 777 778 code << qFunction; 779 780 // Setup 781 code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 782 // Input Evecs and Restriction 783 for (CeedInt i = 0; i < numinputfields; i++) { 784 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 785 CeedChk(ierr); 786 if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 787 code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 788 } 789 } 790 791 for (CeedInt i = 0; i < numoutputfields; i++) { 792 code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 793 } 794 795 code << "const CeedInt Dim = "<<dim<<";\n"; 796 code << "const CeedInt Q1d = "<<Q1d<<";\n"; 797 798 code << "extern __shared__ CeedScalar slice[];\n"; 799 code << "BackendData data;\n"; 800 code << "data.tidx = threadIdx.x;\n"; 801 code << "data.tidy = threadIdx.y;\n"; 802 code << "data.tidz = threadIdx.z;\n"; 803 code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 804 code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 805 806 code << "\n// Input field constants and basis data\n"; 807 //Initialize constants, and matrices B and G 808 for (CeedInt i = 0; i < numinputfields; i++) { 809 code << "// -- Input field "<<i<<" --\n"; 810 // Get elemsize, emode, ncomp 811 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 812 CeedChk(ierr); 813 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 814 CeedChk(ierr); 815 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 816 CeedChk(ierr); 817 ierr = 
CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 818 CeedChk(ierr); 819 820 // Set field constants 821 if (emode != CEED_EVAL_WEIGHT) { 822 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 823 if (basis != CEED_BASIS_COLLOCATED) { 824 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 825 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 826 } else { 827 code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 828 } 829 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 830 } 831 832 // Load basis data 833 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 834 switch (emode) { 835 case CEED_EVAL_NONE: 836 break; 837 case CEED_EVAL_INTERP: 838 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 839 data->B.in[i] = basis_data->d_interp1d; 840 code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 841 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 842 break; 843 case CEED_EVAL_GRAD: 844 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 845 data->B.in[i] = basis_data->d_interp1d; 846 code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 847 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 848 if (basis_data->d_collograd1d) { 849 data->G.in[i] = basis_data->d_collograd1d; 850 code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 851 code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 852 } else { 853 data->G.in[i] = basis_data->d_grad1d; 854 code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 855 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 856 } 857 break; 858 case CEED_EVAL_WEIGHT: 859 break; // No action 860 case CEED_EVAL_DIV: 861 break; // TODO: Not implemented 862 case CEED_EVAL_CURL: 863 break; // TODO: Not implemented 864 } 865 } 866 867 code << "\n// Output field constants and basis data\n"; 868 for (CeedInt i = 0; i < 
numoutputfields; i++) { 869 code << "// -- Output field "<<i<<" --\n"; 870 // Get elemsize, emode, ncomp 871 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 872 CeedChk(ierr); 873 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 874 CeedChk(ierr); 875 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 876 CeedChk(ierr); 877 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 878 CeedChk(ierr); 879 880 // Set field constants 881 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 882 if (basis != CEED_BASIS_COLLOCATED) { 883 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 884 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 885 } else { 886 code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 887 } 888 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 889 890 // Load basis data 891 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 892 switch (emode) { 893 case CEED_EVAL_NONE: 894 break; // No action 895 case CEED_EVAL_INTERP: 896 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 897 data->B.out[i] = basis_data->d_interp1d; 898 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 899 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 900 break; 901 case CEED_EVAL_GRAD: 902 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 903 data->B.out[i] = basis_data->d_interp1d; 904 code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 905 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 906 if (basis_data->d_collograd1d) { 907 data->G.out[i] = basis_data->d_collograd1d; 908 code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 909 code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 910 } else { 911 data->G.out[i] = basis_data->d_grad1d; 912 code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 
913 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 914 } 915 break; 916 case CEED_EVAL_WEIGHT: { 917 Ceed ceed; 918 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 919 return CeedError(ceed, 1, 920 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 921 break; // Should not occur 922 } 923 case CEED_EVAL_DIV: 924 break; // TODO: Not implemented 925 case CEED_EVAL_CURL: 926 break; // TODO: Not implemented 927 } 928 } 929 code << "\n"; 930 code << "__syncthreads();\n"; 931 code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 932 // Input basis apply if needed 933 // Generate the correct eval mode code for each input 934 code << "\n// Input field restrictions and basis actions\n"; 935 for (CeedInt i = 0; i < numinputfields; i++) { 936 code << " // -- Input field "<<i<<" --\n"; 937 // Get elemsize, emode, ncomp 938 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 939 CeedChk(ierr); 940 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 941 CeedChk(ierr); 942 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 943 CeedChk(ierr); 944 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 945 CeedChk(ierr); 946 947 // Restriction 948 if (emode != CEED_EVAL_WEIGHT && 949 !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) { 950 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 951 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 952 data->indices.in[i] = restr_data->d_ind; 953 if (data->indices.in[i]) { 954 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 955 CeedChk(ierr); 956 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 957 CeedInt compstride; 958 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 959 code << " // CompStride: "<<compstride<<"\n"; 960 code << " 
readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 961 } else { 962 bool backendstrides; 963 ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 964 &backendstrides); 965 CeedChk(ierr); 966 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 967 if (!backendstrides) { 968 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 969 CeedChk(ierr); 970 } 971 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 972 code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 973 } 974 } 975 976 // Basis action 977 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 978 switch (emode) { 979 case CEED_EVAL_NONE: 980 if (!basis_data->d_collograd1d) { 981 code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 982 } 983 break; 984 case CEED_EVAL_INTERP: 985 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 986 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 987 break; 988 case CEED_EVAL_GRAD: 989 if (basis_data->d_collograd1d) { 990 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 991 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 992 } else { 993 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 994 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 995 } 996 break; 997 case CEED_EVAL_WEIGHT: 998 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 999 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1000 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1001 data->W = basis_data->d_qweight1d; 1002 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1003 break; // 
No action 1004 case CEED_EVAL_DIV: 1005 break; // TODO: Not implemented 1006 case CEED_EVAL_CURL: 1007 break; // TODO: Not implemented 1008 } 1009 } 1010 1011 // Q function 1012 code << "\n// Output field setup\n"; 1013 for (CeedInt i = 0; i < numoutputfields; i++) { 1014 code << " // -- Output field "<<i<<" --\n"; 1015 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1016 CeedChk(ierr); 1017 if (emode==CEED_EVAL_GRAD) 1018 { 1019 if (basis_data->d_collograd1d) { 1020 //Accumulator for gradient slices 1021 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1022 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1023 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1024 code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1025 code << " }\n"; 1026 code << " }\n"; 1027 } else { 1028 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1029 } 1030 } 1031 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1032 { 1033 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1034 } 1035 } 1036 //We treat quadrature points per slice in 3d to save registers 1037 if (basis_data->d_collograd1d) { 1038 code << "\n // Note: Collocated Gradient\n"; 1039 code << "#pragma unroll\n"; 1040 code << "for (CeedInt q=0; q<Q1d; q++) {\n"; 1041 for (CeedInt i = 0; i < numinputfields; i++) { 1042 code << " // -- Input field "<<i<<" --\n"; 1043 // Get elemsize, emode, ncomp 1044 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1045 CeedChk(ierr); 1046 // Basis action 1047 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1048 switch (emode) { 1049 case CEED_EVAL_NONE: 1050 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1051 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1052 data->indices.in[i] = restr_data->d_ind; 1053 if (data->indices.in[i]) { 1054 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1055 CeedChk(ierr); 1056 code << " const CeedInt lsize_in_"<<i<<" = 
"<<lsize<<";\n"; 1057 CeedInt compstride; 1058 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1059 code << " // CompStride: "<<compstride<<"\n"; 1060 code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1061 } else { 1062 bool backendstrides; 1063 ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1064 &backendstrides); 1065 CeedChk(ierr); 1066 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1067 if (!backendstrides) { 1068 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1069 CeedChk(ierr); 1070 } 1071 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1072 code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1073 } 1074 break; 1075 case CEED_EVAL_INTERP: 1076 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1077 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1078 code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1079 code << " }\n"; 1080 break; 1081 case CEED_EVAL_GRAD: 1082 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1083 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1084 break; 1085 case CEED_EVAL_WEIGHT: 1086 code << " CeedScalar r_q"<<i<<"[1];\n"; 1087 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1088 break; // No action 1089 case CEED_EVAL_DIV: 1090 break; // TODO: Not implemented 1091 case CEED_EVAL_CURL: 1092 break; // TODO: Not implemented 1093 } 1094 } 1095 for (CeedInt i = 0; i < numoutputfields; i++) { 1096 code << " // -- Output field "<<i<<" --\n"; 1097 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1098 CeedChk(ierr); 1099 // Basis action 1100 switch (emode) { 1101 case CEED_EVAL_NONE: 1102 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 
1103 break; // No action 1104 case CEED_EVAL_INTERP: 1105 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1106 break; 1107 case CEED_EVAL_GRAD: 1108 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1109 break; 1110 case CEED_EVAL_WEIGHT: 1111 break; // Should not occur 1112 case CEED_EVAL_DIV: 1113 break; // TODO: Not implemented 1114 case CEED_EVAL_CURL: 1115 break; // TODO: Not implemented 1116 } 1117 } 1118 } else { 1119 code << "\n // Note: No Collocated Gradient\n"; 1120 for (CeedInt i = 0; i < numinputfields; i++) { 1121 code << " // -- Input field "<<i<<" --\n"; 1122 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1123 } 1124 for (CeedInt i = 0; i < numoutputfields; i++) { 1125 code << " // -- Output field "<<i<<" --\n"; 1126 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1127 } 1128 } 1129 code << " // QFunction Inputs and outputs\n"; 1130 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1131 for (CeedInt i = 0; i < numinputfields; i++) { 1132 code << " // -- Input field "<<i<<" --\n"; 1133 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1134 } 1135 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1136 for (CeedInt i = 0; i < numoutputfields; i++) { 1137 code << " // -- Output field "<<i<<" --\n"; 1138 code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1139 } 1140 code << "\n // Apply QFunction\n"; 1141 string qFunctionName(qf_data->qFunctionName); 1142 code << " "<<qFunctionName<<"(ctx, "; 1143 if (dim != 3 || basis_data->d_collograd1d) { 1144 code << "1 "; 1145 } else { 1146 code << "Q1d "; 1147 } 1148 code << ", in, out);\n"; 1149 if (basis_data->d_collograd1d) { 1150 code << "\n // Note: Collocated Gradient\n"; 1151 for (CeedInt i = 0; i < numoutputfields; i++) { 1152 code << " // -- Output field "<<i<<" --\n"; 1153 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1154 CeedChk(ierr); 1155 // Basis action 1156 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1157 switch (emode) { 1158 case CEED_EVAL_NONE: 1159 
code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1160 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1161 code << " }\n"; 1162 break; // No action 1163 case CEED_EVAL_INTERP: 1164 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1165 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1166 code << " }\n"; 1167 break; 1168 case CEED_EVAL_GRAD: 1169 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1170 break; 1171 case CEED_EVAL_WEIGHT: 1172 break; // Should not occur 1173 case CEED_EVAL_DIV: 1174 break; // TODO: Not implemented 1175 case CEED_EVAL_CURL: 1176 break; // TODO: Not implemented 1177 } 1178 } 1179 code << "}\n"; 1180 } 1181 1182 // Output basis apply if needed 1183 // Generate the correct eval mode code for each output 1184 code << "\n// Output field basis action and restrictions\n"; 1185 for (CeedInt i = 0; i < numoutputfields; i++) { 1186 code << " // -- Output field "<<i<<" --\n"; 1187 // Get elemsize, emode, ncomp 1188 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1189 CeedChk(ierr); 1190 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1191 CeedChk(ierr); 1192 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1193 CeedChk(ierr); 1194 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1195 CeedChk(ierr); 1196 // Basis action 1197 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1198 switch (emode) { 1199 case CEED_EVAL_NONE: 1200 code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1201 break; // No action 1202 case CEED_EVAL_INTERP: 1203 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1204 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1205 break; 1206 case CEED_EVAL_GRAD: 1207 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1208 if 
(basis_data->d_collograd1d) { 1209 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1210 } else { 1211 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1212 } 1213 break; 1214 case CEED_EVAL_WEIGHT: { 1215 Ceed ceed; 1216 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1217 return CeedError(ceed, 1, 1218 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1219 break; // Should not occur 1220 } 1221 case CEED_EVAL_DIV: 1222 break; // TODO: Not implemented 1223 case CEED_EVAL_CURL: 1224 break; // TODO: Not implemented 1225 } 1226 // Restriction 1227 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1228 data->indices.out[i] = restr_data->d_ind; 1229 if (data->indices.out[i]) { 1230 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1231 CeedChk(ierr); 1232 code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 1233 CeedInt compstride; 1234 ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1235 code << " // CompStride: "<<compstride<<"\n"; 1236 code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1237 } else { 1238 bool backendstrides; 1239 ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1240 &backendstrides); 1241 CeedChk(ierr); 1242 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1243 if (!backendstrides) { 1244 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1245 CeedChk(ierr); 1246 } 1247 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1248 code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1249 } 1250 } 1251 1252 code << " 
}\n"; 1253 code << "}\n\n"; 1254 1255 // std::cout << code.str(); 1256 1257 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); 1258 CeedChk(ierr); 1259 ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1260 CeedChk(ierr); 1261 1262 ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1263 1264 return 0; 1265 } 1266