1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#include <ceed-backend.h>
#include "ceed-cuda-gen.h"
#include <iostream>
#include <sstream>
#include "../cuda/ceed-cuda.h"
#include "../cuda-reg/ceed-cuda-reg.h"
#include "../cuda-shared/ceed-cuda-shared.h"

// Software fallback for double-precision atomicAdd built on an atomicCAS
// loop.  This source string is prepended to the generated kernel only when
// the device has compute capability < 6 (see the prop.major check in
// CeedCudaGenOperatorBuild below); SM60+ has a native double atomicAdd.
static const char *atomicAdd = QUOTE(
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);

// Device-side helper templates, stringified via QUOTE and emitted at the top
// of every generated operator kernel (runtime-compiled).  Comments inside
// QUOTE(...) are removed by the preprocessor before stringification, so they
// never reach the generated source.
static const char *deviceFunctions = QUOTE(

// Pointer bundles for up to 16 input/output fields, passed by value to the
// generated __global__ "oper" kernel.
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread indices plus this thread-layer's shared-memory slice pointer;
// threaded through every helper below.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

// Cooperatively copy a P x Q basis matrix from global memory d_B into B
// (callers pass a __shared__ destination); all block threads participate.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
}

//****
// 1D

// Gather NCOMP components of element 'elem' from the L-vector d_u into
// registers r_u; field-major layout: d_u[ind + nnodes*comp].
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Same gather for node-interleaved component layout: d_u[ind*NCOMP + comp].
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Gather with compile-time strides (identity-style restriction; no index
// array needed).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
    }
  }
}

// Scatter-add registers r_v into the L-vector d_v; atomicAdd is used since
// multiple elements may index the same node — presumably; confirm against
// restriction semantics.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Scatter-add for node-interleaved component layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Strided scatter: plain += (no atomics) — each (node, elem) pair maps to a
// unique destination under the compile-time strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
    }
  }
}

// 1D tensor contraction V = B*U in the x direction, staged through the
// shared-memory slice.  Contains __syncthreads(): every thread of the block
// must call this (no divergent callers).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}

// Transposed 1D contraction V = B^T*U (quadrature -> nodes).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < Q1d; ++i) {
    *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}

// Interpolate each component from nodes to quadrature points (1D).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// Transpose of interp1d: quadrature points back to nodes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D gradient: a single contraction with c_G.
// NOTE(review): c_B is unused here — presumably kept only for signature
// parity with the 2D/3D grad helpers; confirm.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

// Transpose of grad1d (c_B likewise unused — see note above).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

//****
// 2D

// Gather one element's dofs (field-major layout), one node per (tidx,tidy).
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Gather, node-interleaved component layout.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Gather with compile-time strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
    }
  }
}

// Scatter-add (field-major layout); atomic for shared nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Scatter-add, node-interleaved component layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Strided scatter: plain += (unique destinations, no atomics needed).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
    }
  }
}

// 2D contraction along x; the Q1d x Q1d shared slice holds one value per
// thread.  Barriers inside: must be called by all block threads.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
  }
  __syncthreads();
}

// 2D contraction along y.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
  }
  __syncthreads();
}

// Transposed 2D contraction along y (only the first P1d rows of threads
// produce output; all threads still hit the barriers).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
  }
  __syncthreads();
}

// Transposed 2D contraction along x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// Like ContractXTranspose2d but accumulates into V (no zeroing) — used for
// summing the two gradient terms in gradTranspose2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// 2D interpolation: contract x then y with c_B, per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// Transpose of interp2d (y then x).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D gradient: d/dx goes to r_V[comp + 0*NCOMP], d/dy to r_V[comp + 1*NCOMP].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}

// Transpose of grad2d; second term accumulated via ContractXTransposeAdd2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

//****
// 3D

// Gather one element's dofs: each (tidx,tidy) thread holds a full z-column
// of P1d values per component in registers (r_u[z + comp*P1d]).
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
      }
    }
  }
}

// Gather, node-interleaved component layout.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
      }
    }
  }
}

// Gather with compile-time strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
      }
    }
  }
}

// Read one z-slice (quadrature level q) of quadrature-point data, all
// components, via an index array (field-major layout).
// NOTE(review): the stray second ';' after the 'ind' declaration is a
// harmless empty statement.
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Slice read, node-interleaved component layout.
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Slice read with compile-time strides.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

// Scatter-add a full z-column per thread (field-major layout); atomic for
// shared nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Scatter-add, node-interleaved component layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Strided scatter: plain += (unique destinations under the strides).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
      }
    }
  }
}

// 3D x-contraction applied level by level: each thread caches its row of B
// in registers, then the z-levels are swapped through the shared slice.
// Barriers inside: must be called by all block threads.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidx*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
    __syncthreads();
  }
}

// 3D y-contraction, level by level.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidy*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
    __syncthreads();
  }
}

// 3D z-contraction: fully register-resident (the z-column lives in each
// thread's registers), so no shared memory or barriers are needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i];//contract z direction
    }
  }
}

// Transposed z-contraction; entries k >= P1d are left zeroed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i];//contract z direction
      }
    }
  }
}

// Transposed y-contraction, level by level.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
  {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Accumulating variant of ContractTransposeY3d (no zeroing of V).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Transposed x-contraction, level by level.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// Accumulating variant of ContractTransposeX3d (no zeroing of V).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// 3D interpolation: contract x, y, then z with c_B, per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}

// Transpose of interp3d (z, y, then x).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 3D gradient: derivative d goes to r_V[comp*Q1d + d*NCOMP*Q1d], d = 0,1,2.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}

// Transpose of grad3d; 2nd and 3rd terms accumulate via the Add variants.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// Collocated 3D gradient at quadrature level q using the Q1d x Q1d
// collocation differentiation matrix c_G; outputs the three derivatives to
// r_V[comp + d*NCOMP].  Barriers inside: all block threads must call.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
                                   const CeedScalar *__restrict__ r_U,
                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads();
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    // Z derivative
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
    }
    __syncthreads();
  }
}

// Transpose of gradCollo3d: accumulates the three derivative contributions
// from level q back into r_V (z term contributes partially per q).
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
                                            const CeedScalar *__restrict__ r_U,
                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    __syncthreads();
    // Y derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    __syncthreads();
    // Z derivative
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
    }
  }
}

// Quadrature weight for this thread's point (1D).
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx];
}

// Tensor-product quadrature weight (2D).
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
}

// Tensor-product quadrature weights for the whole z-column (3D).
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt z = 0; z < Q1d; ++z)
  {
    w[z] = pw*qweight1d[z];
  }
}

);

// Build (code-generate) the fused CUDA kernel source for this operator:
// concatenates the helper templates above, the user's Q-function source, and
// generated per-field glue into one "oper" __global__ kernel string.
// Returns 0 on success (and early if setup was already done).
extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {

  using std::ostringstream;
  using std::string;
  int ierr;
  bool setupdone;
  ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
  if (setupdone) return 0;
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
  CeedOperator_Cuda_gen *data;
  ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
  CeedQFunction qf;
  CeedQFunction_Cuda_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
  ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
  CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
          numoutputfields, ncomp, dim = 0, nnodes;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
  ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
  CeedChk(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
  CeedChk(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
  CeedChk(ierr);
  CeedEvalMode emode;
  CeedInterlaceMode imode;
  CeedBasis basis;
  CeedBasis_Cuda_shared *basis_data;
  CeedElemRestriction Erestrict;
  CeedElemRestriction_Cuda_reg *restr_data;

  ostringstream code;
  string devFunctions(deviceFunctions);

  // Add atomicAdd function for old NVidia architectures
  struct cudaDeviceProp prop;
  Ceed_Cuda *ceed_data;
  ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr);
  // NOTE(review): the return value of cudaGetDeviceProperties is assigned
  // to ierr but never checked with CeedChk — confirm this is intentional.
  ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
  if (prop.major<6){
    code << atomicAdd;
  }

  code << devFunctions;

  string qFunction(qf_data->qFunctionSource);

  // Make the CEED_QFUNCTION/CeedPragmaSIMD macros usable inside the
  // runtime-compiled source.
  code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
  code << "\n#define CeedPragmaSIMD\n";

  // Find dim and Q1d
  bool collograd = false;
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);

      // Check for collocated gradient
      if (basis_data->d_collograd1d)
        collograd = true;

      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
      bool isTensor;
      ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
      } else {
        return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
      }
    }
  }
  // Check output bases for Q1d, dim as well
  // The only input basis might be CEED_BASIS_COLLOCATED
  for (CeedInt i = 0; i < numoutputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
      bool isTensor;
      ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
      } else {
        return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
      }
    }
  }
  data->dim = dim;
  data->Q1d = Q1d;

  // Define CEED_Q_VLA
  // In the 3D non-collocated path the Q-function sees Q1d values per thread;
  // otherwise one value at a time.
  if (dim != 3 || collograd) {
    code << "\n#define CEED_Q_VLA 1\n\n";
  } else {
    code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
  }

  code << qFunction;

  // Setup
  code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
  // Input Evecs and Restriction
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChk(ierr);
    if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
      code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
    }
  }

  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
  }

  code << "const CeedInt Dim = "<<dim<<";\n";
  code << "const CeedInt Q1d = "<<Q1d<<";\n";

  // Per-thread BackendData setup in the generated kernel; 'slice' is the
  // dynamically sized shared-memory scratch, partitioned by tidz layer.
  code << "extern __shared__ CeedScalar slice[];\n";
  code << "BackendData data;\n";
  code << "data.tidx = threadIdx.x;\n";
  code << "data.tidy = threadIdx.y;\n";
  code << "data.tidz = threadIdx.z;\n";
  code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
  code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";

  code << "\n// Input field constants and basis data\n";
  // Initialize constants, and matrices B and G
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << "// -- Input field "<<i<<" --\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChk(ierr);

    // Set field constants
    if (emode != CEED_EVAL_WEIGHT) {
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
      if (basis != CEED_BASIS_COLLOCATED) {
        ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
        code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
      } else {
        code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
      }
      code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
    }

    // Load basis data
    code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      // Stash the device interp matrix pointer for the launch, and emit a
      // shared-memory copy of it in the generated kernel.
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      data->B.in[i] = basis_data->d_interp1d;
      code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      // Interp matrix plus either the collocated (Q1d x Q1d) or standard
      // (P1d x Q1d) gradient matrix.
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      data->B.in[i] = basis_data->d_interp1d;
      code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      if (basis_data->d_collograd1d) {
        data->G.in[i] = basis_data->d_collograd1d;
        code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
        code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      } else {
        data->G.in[i] = basis_data->d_grad1d;
        code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
        code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  code << "\n// Output field constants and basis data\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "// -- Output field "<<i<<" --\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChk(ierr);

    // Set field constants
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
      code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
    } else {
      code << " const CeedInt P_out_"<<i<<" = 
"<<Q1d<<";\n"; 971 } 972 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 973 974 // Load basis data 975 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 976 switch (emode) { 977 case CEED_EVAL_NONE: 978 break; // No action 979 case CEED_EVAL_INTERP: 980 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 981 data->B.out[i] = basis_data->d_interp1d; 982 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 983 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 984 break; 985 case CEED_EVAL_GRAD: 986 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 987 data->B.out[i] = basis_data->d_interp1d; 988 code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 989 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 990 if (basis_data->d_collograd1d) { 991 data->G.out[i] = basis_data->d_collograd1d; 992 code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 993 code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 994 } else { 995 data->G.out[i] = basis_data->d_grad1d; 996 code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 997 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 998 } 999 break; 1000 case CEED_EVAL_WEIGHT: { 1001 Ceed ceed; 1002 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1003 return CeedError(ceed, 1, 1004 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1005 break; // Should not occur 1006 } 1007 case CEED_EVAL_DIV: 1008 break; // TODO: Not implemented 1009 case CEED_EVAL_CURL: 1010 break; // TODO: Not implemented 1011 } 1012 } 1013 code << "\n"; 1014 code << "__syncthreads();\n"; 1015 code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1016 // Input basis apply if needed 1017 // Generate the correct eval mode code for each input 1018 code << "\n// Input field restrictions and 
basis actions\n"; 1019 for (CeedInt i = 0; i < numinputfields; i++) { 1020 code << " // -- Input field "<<i<<" --\n"; 1021 // Get elemsize, emode, ncomp 1022 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1023 CeedChk(ierr); 1024 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1025 CeedChk(ierr); 1026 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1027 CeedChk(ierr); 1028 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1029 CeedChk(ierr); 1030 1031 // Restriction 1032 if (emode != CEED_EVAL_WEIGHT && 1033 !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) { 1034 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1035 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1036 data->indices.in[i] = restr_data->d_ind; 1037 if (data->indices.in[i]) { 1038 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 1039 code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 1040 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1041 code << " // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n"; 1042 code << " readDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1043 } else { 1044 bool backendstrides; 1045 ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1046 &backendstrides); 1047 CeedChk(ierr); 1048 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1049 if (!backendstrides) { 1050 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1051 CeedChk(ierr); 1052 } 1053 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1054 code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1055 } 1056 } 1057 1058 // Basis action 1059 code << "// 
EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1060 switch (emode) { 1061 case CEED_EVAL_NONE: 1062 if (!basis_data->d_collograd1d) { 1063 code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1064 } 1065 break; 1066 case CEED_EVAL_INTERP: 1067 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1068 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1069 break; 1070 case CEED_EVAL_GRAD: 1071 if (basis_data->d_collograd1d) { 1072 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1073 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1074 } else { 1075 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1076 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1077 } 1078 break; 1079 case CEED_EVAL_WEIGHT: 1080 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1081 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1082 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1083 data->W = basis_data->d_qweight1d; 1084 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1085 break; // No action 1086 case CEED_EVAL_DIV: 1087 break; // TODO: Not implemented 1088 case CEED_EVAL_CURL: 1089 break; // TODO: Not implemented 1090 } 1091 } 1092 1093 // Q function 1094 code << "\n// Output field setup\n"; 1095 for (CeedInt i = 0; i < numoutputfields; i++) { 1096 code << " // -- Output field "<<i<<" --\n"; 1097 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1098 CeedChk(ierr); 1099 if (emode==CEED_EVAL_GRAD) 1100 { 1101 if (basis_data->d_collograd1d) { 1102 //Accumulator for gradient slices 1103 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1104 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1105 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1106 code << " r_tt"<<i<<"[j + 
i*Q1d] = 0.0;\n"; 1107 code << " }\n"; 1108 code << " }\n"; 1109 } else { 1110 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1111 } 1112 } 1113 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1114 { 1115 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1116 } 1117 } 1118 //We treat quadrature points per slice in 3d to save registers 1119 if (basis_data->d_collograd1d) { 1120 code << "\n // Note: Collocated Gradient\n"; 1121 code << "#pragma unroll\n"; 1122 code << "for (CeedInt q=0; q<Q1d; q++) {\n"; 1123 for (CeedInt i = 0; i < numinputfields; i++) { 1124 code << " // -- Input field "<<i<<" --\n"; 1125 // Get elemsize, emode, ncomp 1126 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1127 CeedChk(ierr); 1128 // Basis action 1129 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1130 switch (emode) { 1131 case CEED_EVAL_NONE: 1132 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1133 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1134 data->indices.in[i] = restr_data->d_ind; 1135 if (data->indices.in[i]) { 1136 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 1137 code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 1138 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1139 code << " // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n"; 1140 code << " readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nnodes_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1141 } else { 1142 bool backendstrides; 1143 ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1144 &backendstrides); 1145 CeedChk(ierr); 1146 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1147 if (!backendstrides) { 1148 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1149 CeedChk(ierr); 1150 } 1151 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", 
"<<strides[2]<<"}\n"; 1152 code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1153 } 1154 break; 1155 case CEED_EVAL_INTERP: 1156 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1157 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1158 code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1159 code << " }\n"; 1160 break; 1161 case CEED_EVAL_GRAD: 1162 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1163 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1164 break; 1165 case CEED_EVAL_WEIGHT: 1166 code << " CeedScalar r_q"<<i<<"[1];\n"; 1167 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1168 break; // No action 1169 case CEED_EVAL_DIV: 1170 break; // TODO: Not implemented 1171 case CEED_EVAL_CURL: 1172 break; // TODO: Not implemented 1173 } 1174 } 1175 for (CeedInt i = 0; i < numoutputfields; i++) { 1176 code << " // -- Output field "<<i<<" --\n"; 1177 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1178 CeedChk(ierr); 1179 // Basis action 1180 switch (emode) { 1181 case CEED_EVAL_NONE: 1182 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1183 break; // No action 1184 case CEED_EVAL_INTERP: 1185 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1186 break; 1187 case CEED_EVAL_GRAD: 1188 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1189 break; 1190 case CEED_EVAL_WEIGHT: 1191 break; // Should not occur 1192 case CEED_EVAL_DIV: 1193 break; // TODO: Not implemented 1194 case CEED_EVAL_CURL: 1195 break; // TODO: Not implemented 1196 } 1197 } 1198 } else { 1199 code << "\n // Note: No Collocated Gradient\n"; 1200 for (CeedInt i = 0; i < numinputfields; i++) { 1201 code << " // -- Input field "<<i<<" --\n"; 1202 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1203 } 1204 for (CeedInt i = 0; i < numoutputfields; i++) { 1205 code << " // -- 
Output field "<<i<<" --\n"; 1206 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1207 } 1208 } 1209 code << " // QFunction Inputs and outputs\n"; 1210 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1211 for (CeedInt i = 0; i < numinputfields; i++) { 1212 code << " // -- Input field "<<i<<" --\n"; 1213 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1214 } 1215 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1216 for (CeedInt i = 0; i < numoutputfields; i++) { 1217 code << " // -- Output field "<<i<<" --\n"; 1218 code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1219 } 1220 code << "\n // Apply QFunction\n"; 1221 string qFunctionName(qf_data->qFunctionName); 1222 code << " "<<qFunctionName<<"(ctx, "; 1223 if (dim != 3 || basis_data->d_collograd1d) { 1224 code << "1 "; 1225 } else { 1226 code << "Q1d "; 1227 } 1228 code << ", in, out);\n"; 1229 if (basis_data->d_collograd1d) { 1230 code << "\n // Note: Collocated Gradient\n"; 1231 for (CeedInt i = 0; i < numoutputfields; i++) { 1232 code << " // -- Output field "<<i<<" --\n"; 1233 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1234 CeedChk(ierr); 1235 // Basis action 1236 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1237 switch (emode) { 1238 case CEED_EVAL_NONE: 1239 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1240 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1241 code << " }\n"; 1242 break; // No action 1243 case CEED_EVAL_INTERP: 1244 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1245 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1246 code << " }\n"; 1247 break; 1248 case CEED_EVAL_GRAD: 1249 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1250 break; 1251 case CEED_EVAL_WEIGHT: 1252 break; // Should not occur 1253 case CEED_EVAL_DIV: 1254 break; // TODO: Not implemented 1255 case CEED_EVAL_CURL: 1256 break; // TODO: Not implemented 1257 } 1258 } 1259 code << "}\n"; 
1260 } 1261 1262 // Output basis apply if needed 1263 // Generate the correct eval mode code for each output 1264 code << "\n// Output field basis action and restrictions\n"; 1265 for (CeedInt i = 0; i < numoutputfields; i++) { 1266 code << " // -- Output field "<<i<<" --\n"; 1267 // Get elemsize, emode, ncomp 1268 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1269 CeedChk(ierr); 1270 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1271 CeedChk(ierr); 1272 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1273 CeedChk(ierr); 1274 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1275 CeedChk(ierr); 1276 // Basis action 1277 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1278 switch (emode) { 1279 case CEED_EVAL_NONE: 1280 code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1281 break; // No action 1282 case CEED_EVAL_INTERP: 1283 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1284 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1285 break; 1286 case CEED_EVAL_GRAD: 1287 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1288 if (basis_data->d_collograd1d) { 1289 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1290 } else { 1291 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1292 } 1293 break; 1294 case CEED_EVAL_WEIGHT: { 1295 Ceed ceed; 1296 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1297 return CeedError(ceed, 1, 1298 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1299 break; // Should not occur 1300 } 1301 case CEED_EVAL_DIV: 1302 break; // TODO: Not implemented 1303 case CEED_EVAL_CURL: 1304 break; // TODO: Not implemented 1305 } 1306 // Restriction 1307 ierr = 
CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1308 data->indices.out[i] = restr_data->d_ind; 1309 if (data->indices.out[i]) { 1310 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 1311 code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 1312 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1313 code << " // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n"; 1314 code << " writeDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1315 } else { 1316 bool backendstrides; 1317 ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1318 &backendstrides); 1319 CeedChk(ierr); 1320 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1321 if (!backendstrides) { 1322 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1323 CeedChk(ierr); 1324 } 1325 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1326 code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1327 } 1328 } 1329 1330 code << " }\n"; 1331 code << "}\n\n"; 1332 1333 // std::cout << code.str(); 1334 1335 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); 1336 CeedChk(ierr); 1337 ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1338 CeedChk(ierr); 1339 1340 ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1341 1342 return 0; 1343 } 1344