1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#include <ceed-backend.h>
#include "ceed-cuda-gen.h"
#include <iostream>
#include <sstream>
#include "../cuda/ceed-cuda.h"
#include "../cuda-reg/ceed-cuda-reg.h"
#include "../cuda-shared/ceed-cuda-shared.h"

// Software fallback for atomicAdd on doubles, built on atomicCAS.
// Prepended to the generated source only for devices with compute
// capability < 6.0, which lack the native double-precision atomicAdd
// (see the prop.major < 6 check in CeedCudaGenOperatorBuild).
// NOTE: comments inside QUOTE(...) are removed by the host preprocessor
// (translation phase 3) before stringification, so they never reach the
// JIT-compiled device source.
static const char *atomicAdd = QUOTE(
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);

// Device-side helper library stringified into every generated operator
// kernel.  Conventions used throughout:
//   - P1d / Q1d are the 1D number of nodes / quadrature points; kernels are
//     launched with blockDim.x = Q1d (and blockDim.y = Q1d in 2D/3D), so
//     threads with tidx/tidy >= P1d must be guarded when touching
//     P1d-sized data.
//   - Basis matrices B (P1d x Q1d) and G live in shared memory; data.slice
//     is this z-slab's scratch slice of dynamic shared memory.
//   - r_* arrays are per-thread registers; d_* pointers are global memory.
//   - "Transpose" variants apply the transposed basis action (quadrature
//     points back to nodes); write-to-nodes paths use atomicAdd because
//     element restrictions may map several elements onto one node.
static const char *deviceFunctions = QUOTE(

typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread indices plus this thread-slab's pointer into shared memory.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

// Cooperatively copy a P x Q basis matrix from global to shared memory;
// every thread of the block strides over the P*Q entries.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
}

//****
// 1D
// Gather one element's nodal values (block-interleaved layout: component
// stride is nnodes), optionally indirect through the restriction indices.
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Same gather for node-interleaved layout (component stride is 1).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read one element's quadrature-point values (component stride nquads).
template <int NCOMP, int Q1d>
inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Read quadrature-point values, interleaved-component layout.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add nodal values back through the restriction; atomicAdd because
// neighboring elements may share nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Scatter-add, interleaved-component layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Write quadrature-point values (each point owned by one thread: plain store).
template <int NCOMP, int Q1d>
inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Write quadrature-point values, interleaved-component layout.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}

// 1D tensor contraction, nodes -> quadrature points: V = B^T applied in x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}

// 1D transposed contraction, quadrature points -> nodes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  // FIX: guard added (matching ContractXTranspose2d/3d) — without it, threads
  // with tidx >= P1d read B[tidx + i*P1d] past the end of the P1d*Q1d basis
  // matrix when Q1d > P1d.  Their result was discarded by the guarded
  // writeDofs1d anyway, so valid lanes are unchanged.
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
    }
  }
  __syncthreads();
}

// Interpolate all components, nodes -> quadrature points.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// Transposed interpolation, quadrature points -> nodes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D gradient: a single contraction with G (c_B is unused in 1D but kept in
// the signature for uniformity with the 2D/3D variants).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

// Transposed 1D gradient (c_B unused; see grad1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

//****
// 2D
// Gather one element's nodal values; one node per (tidx, tidy) thread.
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Gather, interleaved-component layout.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read one element's quadrature-point values.
template <int NCOMP, int Q1d>
inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Read quadrature-point values, interleaved-component layout.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add nodal values (atomicAdd: elements share nodes).
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Scatter-add, interleaved-component layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Write quadrature-point values (one point per thread: plain store).
template <int NCOMP, int Q1d>
inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Write quadrature-point values, interleaved-component layout.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}

// 2D contraction in x (exchange through shared slice, stride Q1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
  }
  __syncthreads();
}

// 2D contraction in y.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
  }
  __syncthreads();
}

// Transposed 2D contraction in y; lanes with tidy >= P1d produce 0.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
  }
  __syncthreads();
}

// Transposed 2D contraction in x; lanes with tidx >= P1d produce 0.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// Transposed 2D contraction in x that accumulates into V (no zeroing).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// 2D interpolation: B in x then B in y, per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// Transposed 2D interpolation: B^T in y then B^T in x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D gradient: d/dx = G_x B_y, d/dy = B_x G_y; outputs stacked by
// derivative direction with stride NCOMP.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}

// Transposed 2D gradient: accumulates both derivative contributions into r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

//****
// 3D
// Gather one element's nodal values; each thread owns a column of P1d
// z-levels in registers (r_u stride P1d per component).
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
      }
    }
  }
}

// Gather, interleaved-component layout.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
      }
    }
  }
}

// Read all Q1d z-levels of quadrature values into registers.
template <int NCOMP, int Q1d>
inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind + nquads * comp];
    }
  }
}

// Read a single z-slice (level q) of quadrature values.
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Read all z-levels, interleaved-component layout.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read a single z-slice, interleaved-component layout.
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add nodal values (atomicAdd: elements share nodes).
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Scatter-add, interleaved-component layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Write all z-levels of quadrature values (plain stores).
template <int NCOMP, int Q1d>
inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + nquads * comp] = r_v[z+comp*Q1d];
    }
  }
}

// Write all z-levels, interleaved-component layout.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d];
    }
  }
}

// 3D contraction in x, applied level-by-level over the z column held in
// registers; this thread's row of B is preloaded into registers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidx*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
    __syncthreads();
  }
}

// 3D contraction in y, level-by-level.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidy*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
    __syncthreads();
  }
}

// 3D contraction in z: purely register-local (z column lives in registers),
// so no shared memory or synchronization is needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i];//contract z direction
    }
  }
}

// Transposed 3D contraction in z; levels k >= P1d are zeroed so downstream
// transposed contractions can loop uniformly.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i];//contract z direction
      }
    }
  }
}

// Transposed 3D contraction in y, level-by-level.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  // FIX: guard the preload — lanes with tidy >= P1d would index
  // B[tidy + i*P1d] past the end of the P1d*Q1d matrix when Q1d > P1d.
  // Those lanes never use r_B (the accumulation below is guarded).
  if (data.tidy<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i)
    {
      r_B[i] = B[data.tidy + i*P1d];
    }
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Transposed 3D contraction in y, accumulating into V (no zeroing).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  // FIX: guarded preload (see ContractTransposeY3d).
  if (data.tidy<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_B[i] = B[data.tidy + i*P1d];
    }
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Transposed 3D contraction in x, level-by-level.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  // FIX: guarded preload — same out-of-bounds pattern as the y variant,
  // here for lanes with tidx >= P1d.
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_B[i] = B[data.tidx + i*P1d];
    }
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// Transposed 3D contraction in x, accumulating into V (no zeroing).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  // FIX: guarded preload (see ContractTransposeX3d).
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_B[i] = B[data.tidx + i*P1d];
    }
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// 3D interpolation: B in x, then y, then z, per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}

// Transposed 3D interpolation: B^T in z, then y, then x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 3D gradient: one G / two B contractions per derivative direction;
// outputs stacked by direction with stride NCOMP*Q1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}

// Transposed 3D gradient: accumulates the three directional contributions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// Collocated 3D gradient at quadrature level q (Q1d x Q1d collocated G).
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
                                   const CeedScalar *__restrict__ r_U,
                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads();
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    // Z derivative (register-only: z column is local to this thread)
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
    }
    __syncthreads();
  }
}

// Transposed collocated 3D gradient at level q; accumulates into r_V.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
                                            const CeedScalar *__restrict__ r_U,
                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    __syncthreads();
    // Y derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    __syncthreads();
    // Z derivative
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
    }
  }
}

// Quadrature weight at this thread's point (1D).
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx];
}

// Tensor-product quadrature weight (2D).
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
}

// Tensor-product quadrature weights for the whole z column (3D).
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt z = 0; z < Q1d; ++z)
  {
    w[z] = pw*qweight1d[z];
  }
}

);

// Build (code-generate and cache) the fused CUDA kernel for this operator.
// Assembles the device-function library above, the user QFunction source,
// and per-field restriction/basis glue into one `oper` kernel string.
extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {

  using std::ostringstream;
  using std::string;
  int ierr;
  bool setupdone;
  ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
  if (setupdone) return 0;  // already built; nothing to do
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
  CeedOperator_Cuda_gen *data;
  ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
  CeedQFunction qf;
  CeedQFunction_Cuda_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
  ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
  CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
  ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
  CeedChk(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
CeedChk(ierr); 816 CeedQFunctionField *qfinputfields, *qfoutputfields; 817 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 818 CeedChk(ierr); 819 CeedEvalMode emode; 820 CeedInterlaceMode imode; 821 CeedBasis basis; 822 CeedBasis_Cuda_shared *basis_data; 823 CeedElemRestriction Erestrict; 824 CeedElemRestriction_Cuda_reg *restr_data; 825 826 ostringstream code; 827 string devFunctions(deviceFunctions); 828 829 // Add atomicAdd function for old NVidia architectures 830 struct cudaDeviceProp prop; 831 Ceed delegate; 832 CeedGetDelegate(ceed, &delegate); 833 Ceed_Cuda *ceed_data; 834 ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr); 835 ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 836 if (prop.major<6){ 837 code << atomicAdd; 838 } 839 840 code << devFunctions; 841 842 string qFunction(qf_data->qFunctionSource); 843 844 code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 845 code << "\n#define CeedPragmaSIMD\n"; 846 code << qFunction; 847 848 // Setup 849 code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 850 // Input Evecs and Restriction 851 for (CeedInt i = 0; i < numinputfields; i++) { 852 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 853 CeedChk(ierr); 854 if (emode == CEED_EVAL_WEIGHT) { // Skip 855 } else { 856 code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 857 if (emode != CEED_EVAL_NONE) 858 { 859 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 860 bool isTensor; 861 ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 862 //TODO check that all are the same 863 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 864 if (isTensor) 865 { 866 //TODO check that all are the same 867 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 868 } else { 869 return CeedError(ceed, 1, "Backend 
does not implement operators with non-tensor basis"); 870 } 871 } 872 } 873 } 874 data->dim = dim; 875 data->Q1d = Q1d; 876 877 for (CeedInt i = 0; i < numoutputfields; i++) { 878 code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 879 } 880 code << "const CeedInt Dim = "<<dim<<";\n"; 881 code << "const CeedInt Q1d = "<<Q1d<<";\n"; 882 // code << "const CeedInt Q = "<<Q<<";\n"; 883 code << "extern __shared__ CeedScalar slice[];\n"; 884 code << "BackendData data;\n"; 885 code << "data.tidx = threadIdx.x;\n"; 886 code << "data.tidy = threadIdx.y;\n"; 887 code << "data.tidz = threadIdx.z;\n"; 888 code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 889 code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 890 //Initialize constants, and matrices B and G 891 for (CeedInt i = 0; i < numinputfields; i++) { 892 code << "// Input field "<<i<<"\n"; 893 // Get elemsize, emode, ncomp 894 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 895 CeedChk(ierr); 896 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 897 CeedChk(ierr); 898 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 899 CeedChk(ierr); 900 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 901 CeedChk(ierr); 902 // Basis action 903 switch (emode) { 904 case CEED_EVAL_NONE: 905 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 906 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 907 code << " const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n"; 908 break; 909 case CEED_EVAL_INTERP: 910 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 911 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 912 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 913 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 914 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 915 code << " const CeedInt 
nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 916 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 917 data->B.in[i] = basis_data->d_interp1d; 918 code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 919 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 920 break; 921 case CEED_EVAL_GRAD: 922 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 923 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 924 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 925 code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 926 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 927 code << "const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 928 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 929 data->B.in[i] = basis_data->d_interp1d; 930 code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 931 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 932 if (basis_data->d_collograd1d) { 933 data->G.in[i] = basis_data->d_collograd1d; 934 code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 935 code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 936 } else { 937 data->G.in[i] = basis_data->d_grad1d; 938 code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 939 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 940 } 941 break; 942 case CEED_EVAL_WEIGHT: 943 break; // No action 944 case CEED_EVAL_DIV: 945 break; // TODO: Not implemented 946 case CEED_EVAL_CURL: 947 break; // TODO: Not implemented 948 } 949 } 950 for (CeedInt i = 0; i < numoutputfields; i++) { 951 code << "// Output field "<<i<<"\n"; 952 // Get elemsize, emode, ncomp 953 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 954 CeedChk(ierr); 955 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 956 CeedChk(ierr); 957 ierr = 
CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 958 CeedChk(ierr); 959 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 960 CeedChk(ierr); 961 // Basis action 962 switch (emode) { 963 case CEED_EVAL_NONE: 964 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 965 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 966 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 967 code << " const CeedInt nquads_out_"<<i<<" = "<<nnodes<<";\n"; 968 break; // No action 969 case CEED_EVAL_INTERP: 970 code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 971 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 972 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 973 code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 974 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 975 data->B.out[i] = basis_data->d_interp1d; 976 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 977 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 978 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 979 code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 980 break; 981 case CEED_EVAL_GRAD: 982 code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 983 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 984 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 985 code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 986 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 987 data->B.out[i] = basis_data->d_interp1d; 988 code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 989 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 990 if (basis_data->d_collograd1d) { 991 data->G.out[i] = basis_data->d_collograd1d; 992 code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 993 code << 
"loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 994 } else { 995 data->G.out[i] = basis_data->d_grad1d; 996 code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 997 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 998 } 999 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 1000 code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 1001 break; 1002 case CEED_EVAL_WEIGHT: { 1003 Ceed ceed; 1004 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1005 return CeedError(ceed, 1, 1006 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1007 break; // Should not occur 1008 } 1009 case CEED_EVAL_DIV: 1010 break; // TODO: Not implemented 1011 case CEED_EVAL_CURL: 1012 break; // TODO: Not implemented 1013 } 1014 } 1015 code << "\n"; 1016 code << "__syncthreads();\n"; 1017 code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1018 // Input basis apply if needed 1019 // Generate the correct eval mode code for each input 1020 for (CeedInt i = 0; i < numinputfields; i++) { 1021 code << " // Input field "<<i<<"\n"; 1022 // Get elemsize, emode, ncomp 1023 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1024 CeedChk(ierr); 1025 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1026 CeedChk(ierr); 1027 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1028 CeedChk(ierr); 1029 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1030 CeedChk(ierr); 1031 // Basis action 1032 switch (emode) { 1033 case CEED_EVAL_NONE: 1034 if (!basis_data->d_collograd1d) { 1035 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1036 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1037 code << " readQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n"; 1038 } 1039 
break; 1040 case CEED_EVAL_INTERP: 1041 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1042 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1043 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1044 data->indices.in[i] = restr_data->d_ind; 1045 code << " readDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1046 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1047 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1048 break; 1049 case CEED_EVAL_GRAD: 1050 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1051 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1052 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1053 data->indices.in[i] = restr_data->d_ind; 1054 code << " readDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1055 if (basis_data->d_collograd1d) { 1056 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1057 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1058 } else { 1059 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1060 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1061 } 1062 break; 1063 case CEED_EVAL_WEIGHT: 1064 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1065 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1066 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1067 data->W = basis_data->d_qweight1d; 1068 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1069 break; // 
No action 1070 case CEED_EVAL_DIV: 1071 break; // TODO: Not implemented 1072 case CEED_EVAL_CURL: 1073 break; // TODO: Not implemented 1074 } 1075 } 1076 1077 // Q function 1078 for (CeedInt i = 0; i < numoutputfields; i++) { 1079 code << " // Output field "<<i<<"\n"; 1080 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1081 CeedChk(ierr); 1082 if (emode==CEED_EVAL_GRAD) 1083 { 1084 if (basis_data->d_collograd1d) { 1085 //Accumulator for gradient slices 1086 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1087 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1088 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1089 code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1090 code << " }\n"; 1091 code << " }\n"; 1092 } else { 1093 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1094 } 1095 } 1096 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1097 { 1098 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1099 } 1100 } 1101 //We treat quadrature points per slice in 3d to save registers 1102 code << "// QFunction\n"; 1103 if (basis_data->d_collograd1d) { 1104 code << "#pragma unroll\n"; 1105 code << "for (CeedInt q=0; q<Q1d; q++) {\n"; 1106 for (CeedInt i = 0; i < numinputfields; i++) { 1107 code << " // Input field "<<i<<"\n"; 1108 // Get elemsize, emode, ncomp 1109 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1110 CeedChk(ierr); 1111 // Basis action 1112 switch (emode) { 1113 case CEED_EVAL_NONE: 1114 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1115 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1116 code << " readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1117 break; 1118 case CEED_EVAL_INTERP: 1119 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1120 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1121 code << " 
r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1122 code << " }\n"; 1123 break; 1124 case CEED_EVAL_GRAD: 1125 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1126 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1127 break; 1128 case CEED_EVAL_WEIGHT: 1129 code << " CeedScalar r_q"<<i<<"[1];\n"; 1130 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1131 break; // No action 1132 case CEED_EVAL_DIV: 1133 break; // TODO: Not implemented 1134 case CEED_EVAL_CURL: 1135 break; // TODO: Not implemented 1136 } 1137 } 1138 for (CeedInt i = 0; i < numoutputfields; i++) { 1139 code << " // Output field "<<i<<"\n"; 1140 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1141 CeedChk(ierr); 1142 // Basis action 1143 switch (emode) { 1144 case CEED_EVAL_NONE: 1145 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1146 break; // No action 1147 case CEED_EVAL_INTERP: 1148 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1149 break; 1150 case CEED_EVAL_GRAD: 1151 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1152 break; 1153 case CEED_EVAL_WEIGHT: 1154 break; // Should not occur 1155 case CEED_EVAL_DIV: 1156 break; // TODO: Not implemented 1157 case CEED_EVAL_CURL: 1158 break; // TODO: Not implemented 1159 } 1160 } 1161 } else { 1162 for (CeedInt i = 0; i < numinputfields; i++) { 1163 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1164 } 1165 for (CeedInt i = 0; i < numoutputfields; i++) { 1166 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1167 } 1168 } 1169 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1170 for (CeedInt i = 0; i < numinputfields; i++) { 1171 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1172 } 1173 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1174 for (CeedInt i = 0; i < numoutputfields; i++) { 1175 code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1176 } 1177 string qFunctionName(qf_data->qFunctionName); 1178 code << " "<<qFunctionName<<"(ctx, "; 1179 if 
(dim != 3 || basis_data->d_collograd1d) { 1180 code << "1 "; 1181 }else{ 1182 code << "Q1d "; 1183 } 1184 code << ", in, out);\n"; 1185 if (basis_data->d_collograd1d) { 1186 for (CeedInt i = 0; i < numoutputfields; i++) { 1187 code << " // Output field "<<i<<"\n"; 1188 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1189 CeedChk(ierr); 1190 // Basis action 1191 switch (emode) { 1192 case CEED_EVAL_NONE: 1193 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1194 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1195 code << " }\n"; 1196 break; // No action 1197 case CEED_EVAL_INTERP: 1198 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1199 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1200 code << " }\n"; 1201 break; 1202 case CEED_EVAL_GRAD: 1203 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1204 break; 1205 case CEED_EVAL_WEIGHT: 1206 break; // Should not occur 1207 case CEED_EVAL_DIV: 1208 break; // TODO: Not implemented 1209 case CEED_EVAL_CURL: 1210 break; // TODO: Not implemented 1211 } 1212 } 1213 code << "}\n"; 1214 } 1215 1216 // Output basis apply if needed 1217 // Generate the correct eval mode code for each output 1218 for (CeedInt i = 0; i < numoutputfields; i++) { 1219 code << " // Output field "<<i<<"\n"; 1220 // Get elemsize, emode, ncomp 1221 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1222 CeedChk(ierr); 1223 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1224 CeedChk(ierr); 1225 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1226 CeedChk(ierr); 1227 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1228 CeedChk(ierr); 1229 // Basis action 1230 switch (emode) { 1231 case CEED_EVAL_NONE: 1232 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1233 code << " 
writeQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n"; 1234 break; // No action 1235 case CEED_EVAL_INTERP: 1236 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1237 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1238 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1239 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1240 data->indices.out[i] = restr_data->d_ind; 1241 code << " writeDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1242 break; 1243 case CEED_EVAL_GRAD: 1244 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1245 if (basis_data->d_collograd1d) { 1246 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1247 } else { 1248 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1249 } 1250 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1251 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1252 data->indices.out[i] = restr_data->d_ind; 1253 code << " writeDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1254 break; 1255 case CEED_EVAL_WEIGHT: { 1256 Ceed ceed; 1257 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1258 return CeedError(ceed, 1, 1259 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1260 break; // Should not occur 1261 } 1262 case CEED_EVAL_DIV: 1263 break; // TODO: Not implemented 1264 case 
CEED_EVAL_CURL: 1265 break; // TODO: Not implemented 1266 } 1267 } 1268 1269 code << " }\n"; 1270 code << "}\n\n"; 1271 1272 // std::cout << code.str(); 1273 1274 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr); 1275 ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1276 CeedChk(ierr); 1277 1278 ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1279 1280 return 0; 1281 } 1282