1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#include <ceed-backend.h>
#include "ceed-cuda-gen.h"
#include <iostream>
#include <sstream>
#include "../cuda/ceed-cuda.h"
#include "../cuda-reg/ceed-cuda-reg.h"
#include "../cuda-shared/ceed-cuda-shared.h"

// CAS-loop emulation of atomicAdd for doubles. Native double-precision
// atomicAdd requires SM60+; this snippet is prepended to the generated
// kernel source only when the device's compute capability major version
// is < 6 (see the prop.major check in CeedCudaGenOperatorBuild below).
static const char *atomicAdd = QUOTE(
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);

// Device-side helper templates, stringified via QUOTE (the preprocessor
// strips these comments before stringification) and injected into every
// generated operator kernel ahead of the QFunction source.
static const char *deviceFunctions = QUOTE(

// Bundles of per-field device pointers passed by value to the generated
// kernel (up to 16 input and 16 output fields each).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread context: block-local thread indices, the flattened thread id,
// and this thread's window into the dynamic shared-memory scratch buffer
// (the generated prologue sets slice = shared base + tidz*Q1d[*Q1d]).
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

// Cooperatively copy the P x Q basis matrix d_B (global memory) into B
// (a __shared__ buffer in the generated code); all threads of the block
// participate via a stride of the full block size.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
}

//****
// 1D

// Gather the NCOMP components of element `elem` (P1d nodes, one per tidx)
// from d_u into registers r_u through the restriction `indices`.
// Field layout is node-major: component stride is `nnodes`.
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Same gather for the interleaved ("transpose") layout: the components of
// one node are contiguous (node stride NCOMP); `nnodes` is unused here but
// kept so both layouts share a signature.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Gather without an index array: strides[0]/strides[1]/strides[2] are the
// node / component / element strides of the (strided-restriction) field.
template <int NCOMP, int P1d>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * strides[0] + elem * strides[2];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * strides[1]];
    }
  }
}

// Scatter-add registers r_v back to d_v (node-major layout). atomicAdd is
// used because a node can be referenced by several elements whose threads
// update d_v concurrently.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Scatter-add for the interleaved layout (see readDofsTranspose1d).
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Scatter-add for strided fields; plain += (no atomics) — presumably a
// strided restriction never maps two elements to the same location.
// NOTE(review): confirm no-aliasing assumption for strided restrictions.
template <int NCOMP, int P1d>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * strides[0] + elem * strides[2];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + comp * strides[1]] += r_v[comp];
    }
  }
}

// 1D contraction along x: V[tidx] = sum_i B[i + tidx*P1d] * U[i].
// Each thread publishes its U value to the shared slice so every thread
// can read all P1d values; the barriers bracket the shared write/read and
// protect reuse of slice by the next contraction. Callers invoke this
// unconditionally, so all block threads reach both __syncthreads().
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}

// Transposed 1D contraction: V[tidx] = sum_i B[tidx + i*P1d] * U[i],
// summing over the Q1d quadrature points.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < Q1d; ++i) {
    *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}

// Interpolate each of the NCOMP components from nodes to quadrature points
// with the 1D interpolation matrix c_B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// Transpose (quadrature points -> nodes) of interp1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D gradient: contract with the derivative matrix c_G along x. c_B is
// unused in 1D but kept so all grad<dim>d helpers share one signature.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

// Transpose of grad1d (again only c_G is used).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++)
  {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

//****
// 2D

// 2D gather: each active (tidx,tidy) thread owns one of the P1d*P1d nodes
// of element `elem`; node-major layout with component stride `nnodes`.
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// 2D gather, interleaved component layout (node stride NCOMP).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// 2D gather for strided fields (strides = node/component/element strides).
template <int NCOMP, int P1d>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * strides[0] + elem * strides[2];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * strides[1]];
    }
  }
}

// 2D scatter-add, node-major layout; atomic because nodes may be shared
// between concurrently processed elements.
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}
// 2D scatter-add, interleaved component layout; atomic for shared nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// 2D scatter-add for strided fields; plain += (no atomics) — presumably a
// strided restriction never aliases locations. NOTE(review): confirm.
template <int NCOMP, int P1d>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * strides[0] + elem * strides[2];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + comp * strides[1]] += r_v[comp];
    }
  }
}

// 2D contraction along x through a Q1d x Q1d shared slice:
// V(tidx,tidy) = sum_i B[i + tidx*P1d] * U(i,tidy).
// Barriers bracket the shared write/read and protect slice reuse; all
// block threads reach them (callers invoke this unconditionally).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
  }
  __syncthreads();
}

// 2D contraction along y: V(tidx,tidy) = sum_i B[i + tidy*P1d] * U(tidx,i).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
  }
  __syncthreads();
}

// Transposed 2D y-contraction (sum over Q1d quadrature points). Only
// threads with tidy < P1d produce a value; *V is zeroed for everyone so
// inactive threads hold a defined 0. The guard keeps __syncthreads()
// outside the divergent branch.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
  }
  __syncthreads();
}

// Transposed 2D x-contraction (active for tidx < P1d; see above).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// Accumulating variant of ContractXTranspose2d: *V is NOT zeroed, so the
// result adds onto an existing value (used by gradTranspose2d to sum the
// two derivative contributions).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// 2D interpolation: contract c_B along x then y, per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// Transpose of interp2d (y then x, with the transposed contractions).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D gradient: d/dx goes to r_V[comp + 0*NCOMP], d/dy to
// r_V[comp + 1*NCOMP] (dimension-major output layout).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}

// Transpose of grad2d: the second (y-derivative) contribution is summed
// into r_V via the accumulating x-contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

//****
// 3D

// 3D gather: each active (tidx,tidy) thread owns a full z-line of nodes;
// register layout r_u[z + comp*P1d]. Node-major global layout with
// component stride `nnodes`.
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
      }
    }
  }
}

// 3D gather, interleaved component layout (node stride NCOMP).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
      }
    }
  }
}

// 3D gather for strided fields (strides = node/component/element strides).
template <int NCOMP, int P1d>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * strides[0] + elem * strides[2];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + comp * strides[1]];
      }
    }
  }
}

// Read the NCOMP quadrature values of a single z-plane `q` of element
// `elem` (node-major layout with component stride `nquads`).
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; // NOTE(review): stray extra ';' — harmless empty statement
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Plane read for the interleaved component layout.
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Plane read for strided fields.
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedInt *strides, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node * strides[0] + elem * strides[2];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + comp * strides[1]];
  }
}

// 3D scatter-add (node-major layout); atomic for nodes shared between
// concurrently processed elements.
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// 3D scatter-add, interleaved layout; atomic for shared nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// 3D scatter-add for strided fields; plain += (see 1D/2D note).
template <int NCOMP, int P1d>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedInt *strides, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * strides[0] + elem * strides[2];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        d_v[ind + comp * strides[1]] += r_v[z+comp*P1d];
      }
    }
  }
}

// 3D contraction along x. The thread's B row is cached in registers, then
// each z-plane k is staged through the shared Q1d x Q1d slice and reduced.
// Barriers inside the k-loop order the plane's shared write/read and its
// reuse for the next plane; all block threads reach them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidx*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
    __syncthreads();
  }
}

// 3D contraction along y (same plane-by-plane staging as ContractX3d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidy*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
    __syncthreads();
  }
}

// 3D contraction along z: register-only (each thread owns its whole
// z-line), so no shared memory or barriers are needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i];//contract z direction
    }
  }
}

// Transposed z-contraction, register-only. All Q1d slots of V are zeroed;
// only the first P1d accumulate, leaving the tail defined as 0.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i];//contract z direction
      }
    }
  }
}

// Transposed 3D y-contraction; active for tidy < P1d, plane-staged through
// the shared slice (the tidy guard keeps barriers outside divergence).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
  {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Accumulating variant of ContractTransposeY3d (V is not zeroed).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Transposed 3D x-contraction; active for tidx < P1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// Accumulating variant of ContractTransposeX3d (V is not zeroed); used by
// gradTranspose3d to sum the y- and z-derivative contributions into r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// 3D interpolation: contract c_B along x, y, then z per component;
// register layout r_U[z + comp*P1d] -> r_V[z + comp*Q1d].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}

// Transpose of interp3d (z, y, then x with transposed contractions).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 3D gradient: one x/y/z pass per derivative direction, applying c_G on
// that direction and c_B on the other two; output layout
// r_V[z + comp*Q1d + dim*NCOMP*Q1d].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}

// Transpose of grad3d: the first pass writes r_V, the next two accumulate
// into it via the Add variant of the final x-contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// Collocated 3D gradient at quadrature plane q using the Q1d x Q1d
// collocated derivative matrix c_G: x and y derivatives are computed
// through the shared slice, the z derivative directly from the thread's
// register z-line. Output layout r_V[comp + dim*NCOMP].
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
                                   const CeedScalar *__restrict__ r_U,
                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads();
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    // Z derivative
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
    }
    __syncthreads();
  }
}

// Transpose of gradCollo3d: accumulates the three derivative contributions
// for plane q into r_V; the z part scatters across all planes of the
// register z-line (hence "PARTIAL" — completed over the caller's q loop).
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
                                            const CeedScalar *__restrict__ r_U,
                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    __syncthreads();
    // Y derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    __syncthreads();
    // Z derivative
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
    }
  }
}

// Quadrature weight at this thread's 1D point.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx];
}

// Tensor-product quadrature weight at this thread's 2D point.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
}

// Tensor-product quadrature weights for the thread's full z-line of
// 3D points (w[z] for z in [0, Q1d)).
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt z = 0; z < Q1d; ++z)
  {
    w[z] = pw*qweight1d[z];
  }
}

);

// Generate the CUDA source for the single fused kernel implementing this
// operator (restrictions + basis actions + QFunction). Only the beginning
// of this routine is visible in this chunk: it fetches the operator and
// QFunction metadata, then starts emitting code into an ostringstream.
extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {

  using std::ostringstream;
  using std::string;
  int ierr;
  bool setupdone;
  ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
  if (setupdone) return 0; // kernel already generated — nothing to do
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
  CeedOperator_Cuda_gen *data;
  ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
  CeedQFunction qf;
  CeedQFunction_Cuda_gen *qf_data;
  ierr =
CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 774 ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 775 CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes; 776 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 777 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 778 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 779 CeedChk(ierr); 780 CeedOperatorField *opinputfields, *opoutputfields; 781 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 782 CeedChk(ierr); 783 CeedQFunctionField *qfinputfields, *qfoutputfields; 784 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 785 CeedChk(ierr); 786 CeedEvalMode emode; 787 CeedInterlaceMode imode; 788 CeedBasis basis; 789 CeedBasis_Cuda_shared *basis_data; 790 CeedElemRestriction Erestrict; 791 CeedElemRestriction_Cuda_reg *restr_data; 792 793 ostringstream code; 794 string devFunctions(deviceFunctions); 795 796 // Add atomicAdd function for old NVidia architectures 797 struct cudaDeviceProp prop; 798 Ceed delegate; 799 CeedGetDelegate(ceed, &delegate); 800 Ceed_Cuda *ceed_data; 801 ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr); 802 ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 803 if (prop.major<6){ 804 code << atomicAdd; 805 } 806 807 code << devFunctions; 808 809 string qFunction(qf_data->qFunctionSource); 810 811 code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 812 code << "\n#define CeedPragmaSIMD\n"; 813 code << qFunction; 814 815 // Setup 816 code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFieldsInt strides, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 817 // Input Evecs and Restriction 818 for (CeedInt i = 0; i < numinputfields; i++) { 819 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 820 CeedChk(ierr); 821 if 
(emode == CEED_EVAL_WEIGHT) { // Skip 822 } else { 823 code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 824 if (emode != CEED_EVAL_NONE) 825 { 826 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 827 bool isTensor; 828 ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 829 //TODO check that all are the same 830 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 831 if (isTensor) 832 { 833 //TODO check that all are the same 834 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 835 } else { 836 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 837 } 838 } 839 } 840 } 841 data->dim = dim; 842 data->Q1d = Q1d; 843 844 for (CeedInt i = 0; i < numoutputfields; i++) { 845 code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 846 } 847 code << "const CeedInt Dim = "<<dim<<";\n"; 848 code << "const CeedInt Q1d = "<<Q1d<<";\n"; 849 // code << "const CeedInt Q = "<<Q<<";\n"; 850 code << "extern __shared__ CeedScalar slice[];\n"; 851 code << "BackendData data;\n"; 852 code << "data.tidx = threadIdx.x;\n"; 853 code << "data.tidy = threadIdx.y;\n"; 854 code << "data.tidz = threadIdx.z;\n"; 855 code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 856 code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 857 858 code << "\n// Input field constants and basis data\n"; 859 //Initialize constants, and matrices B and G 860 for (CeedInt i = 0; i < numinputfields; i++) { 861 code << "// -- Input field "<<i<<" --\n"; 862 // Get elemsize, emode, ncomp 863 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 864 CeedChk(ierr); 865 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 866 CeedChk(ierr); 867 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 868 CeedChk(ierr); 869 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 870 CeedChk(ierr); 
871 872 // Set field constants 873 if (emode != CEED_EVAL_WEIGHT) { 874 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 875 if (basis != CEED_BASIS_COLLOCATED) { 876 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 877 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 878 } else { 879 code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 880 } 881 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 882 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 883 code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 884 } 885 886 // Load basis data 887 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 888 switch (emode) { 889 case CEED_EVAL_NONE: 890 break; 891 case CEED_EVAL_INTERP: 892 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 893 data->B.in[i] = basis_data->d_interp1d; 894 code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 895 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 896 break; 897 case CEED_EVAL_GRAD: 898 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 899 data->B.in[i] = basis_data->d_interp1d; 900 code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 901 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 902 if (basis_data->d_collograd1d) { 903 data->G.in[i] = basis_data->d_collograd1d; 904 code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 905 code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 906 } else { 907 data->G.in[i] = basis_data->d_grad1d; 908 code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 909 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 910 } 911 break; 912 case CEED_EVAL_WEIGHT: 913 break; // No action 914 case CEED_EVAL_DIV: 915 break; // TODO: Not implemented 916 case CEED_EVAL_CURL: 917 break; // TODO: Not implemented 918 } 919 } 920 921 code << "\n// Output 
field constants and basis data\n"; 922 for (CeedInt i = 0; i < numoutputfields; i++) { 923 code << "// -- Output field "<<i<<" --\n"; 924 // Get elemsize, emode, ncomp 925 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 926 CeedChk(ierr); 927 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 928 CeedChk(ierr); 929 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 930 CeedChk(ierr); 931 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 932 CeedChk(ierr); 933 934 // Set field constants 935 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 936 if (basis != CEED_BASIS_COLLOCATED) { 937 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 938 code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 939 } else { 940 code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 941 } 942 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 943 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 944 code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 945 946 // Load basis data 947 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 948 switch (emode) { 949 case CEED_EVAL_NONE: 950 break; // No action 951 case CEED_EVAL_INTERP: 952 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 953 data->B.out[i] = basis_data->d_interp1d; 954 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 955 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 956 break; 957 case CEED_EVAL_GRAD: 958 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 959 data->B.out[i] = basis_data->d_interp1d; 960 code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 961 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 962 if (basis_data->d_collograd1d) { 963 data->G.out[i] = basis_data->d_collograd1d; 964 code << "__shared__ double 
s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 965 code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 966 } else { 967 data->G.out[i] = basis_data->d_grad1d; 968 code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 969 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 970 } 971 break; 972 case CEED_EVAL_WEIGHT: { 973 Ceed ceed; 974 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 975 return CeedError(ceed, 1, 976 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 977 break; // Should not occur 978 } 979 case CEED_EVAL_DIV: 980 break; // TODO: Not implemented 981 case CEED_EVAL_CURL: 982 break; // TODO: Not implemented 983 } 984 } 985 code << "\n"; 986 code << "__syncthreads();\n"; 987 code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 988 // Input basis apply if needed 989 // Generate the correct eval mode code for each input 990 code << "\n// Input field restrictions and basis actions\n"; 991 for (CeedInt i = 0; i < numinputfields; i++) { 992 code << " // -- Input field "<<i<<" --\n"; 993 // Get elemsize, emode, ncomp 994 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 995 CeedChk(ierr); 996 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 997 CeedChk(ierr); 998 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 999 CeedChk(ierr); 1000 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1001 CeedChk(ierr); 1002 1003 // Restriction 1004 if (emode != CEED_EVAL_WEIGHT && 1005 !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) { 1006 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1007 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1008 data->indices.in[i] = restr_data->d_ind; 1009 if (data->indices.in[i]) { 1010 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1011 code << " // InterlaceMode: 
"<<CeedInterlaceModes[imode]<<"\n"; 1012 code << " readDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1013 } else { 1014 bool backendstrides; 1015 ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1016 &backendstrides); 1017 CeedChk(ierr); 1018 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1019 if (!backendstrides) { 1020 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1021 CeedChk(ierr); 1022 } 1023 ierr = cudaMalloc( (void **)&data->strides.in[i], 3 * sizeof(CeedInt)); 1024 CeedChk_Cu(ceed, (CUresult)ierr); 1025 ierr = cudaMemcpy(data->strides.in[i], strides, 3 * sizeof(CeedInt), 1026 cudaMemcpyHostToDevice); 1027 CeedChk_Cu(ceed, (CUresult)ierr); 1028 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1029 code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, elem, strides.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1030 } 1031 } 1032 1033 // Basis action 1034 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1035 switch (emode) { 1036 case CEED_EVAL_NONE: 1037 if (!basis_data->d_collograd1d) { 1038 code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1039 } 1040 break; 1041 case CEED_EVAL_INTERP: 1042 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1043 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1044 break; 1045 case CEED_EVAL_GRAD: 1046 if (basis_data->d_collograd1d) { 1047 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1048 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1049 } else { 1050 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1051 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1052 } 1053 break; 1054 
case CEED_EVAL_WEIGHT: 1055 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1056 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1057 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1058 data->W = basis_data->d_qweight1d; 1059 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1060 break; // No action 1061 case CEED_EVAL_DIV: 1062 break; // TODO: Not implemented 1063 case CEED_EVAL_CURL: 1064 break; // TODO: Not implemented 1065 } 1066 } 1067 1068 // Q function 1069 code << "\n// Output field setup\n"; 1070 for (CeedInt i = 0; i < numoutputfields; i++) { 1071 code << " // -- Output field "<<i<<" --\n"; 1072 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1073 CeedChk(ierr); 1074 if (emode==CEED_EVAL_GRAD) 1075 { 1076 if (basis_data->d_collograd1d) { 1077 //Accumulator for gradient slices 1078 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1079 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1080 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1081 code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1082 code << " }\n"; 1083 code << " }\n"; 1084 } else { 1085 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1086 } 1087 } 1088 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1089 { 1090 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1091 } 1092 } 1093 //We treat quadrature points per slice in 3d to save registers 1094 if (basis_data->d_collograd1d) { 1095 code << "\n // Note: Collocated Gradient\n"; 1096 code << "#pragma unroll\n"; 1097 code << "for (CeedInt q=0; q<Q1d; q++) {\n"; 1098 for (CeedInt i = 0; i < numinputfields; i++) { 1099 code << " // -- Input field "<<i<<" --\n"; 1100 // Get elemsize, emode, ncomp 1101 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1102 CeedChk(ierr); 1103 // Basis action 1104 code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1105 switch (emode) { 1106 case CEED_EVAL_NONE: 1107 code << " 
CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1108 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1109 data->indices.in[i] = restr_data->d_ind; 1110 if (data->indices.in[i]) { 1111 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1112 code << " // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n"; 1113 code << " readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1114 } else { 1115 bool backendstrides; 1116 ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1117 &backendstrides); 1118 CeedChk(ierr); 1119 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1120 if (!backendstrides) { 1121 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1122 CeedChk(ierr); 1123 } 1124 ierr = cudaMalloc( (void **)&data->strides.in[i], 3 * sizeof(CeedInt)); 1125 CeedChk_Cu(ceed, (CUresult)ierr); 1126 ierr = cudaMemcpy(data->strides.in[i], strides, 3 * sizeof(CeedInt), 1127 cudaMemcpyHostToDevice); 1128 CeedChk_Cu(ceed, (CUresult)ierr); 1129 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1130 code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d>(data, elem, q, strides.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1131 } 1132 break; 1133 case CEED_EVAL_INTERP: 1134 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1135 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1136 code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1137 code << " }\n"; 1138 break; 1139 case CEED_EVAL_GRAD: 1140 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1141 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1142 break; 1143 case CEED_EVAL_WEIGHT: 1144 code << " CeedScalar r_q"<<i<<"[1];\n"; 1145 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1146 break; // No action 1147 case CEED_EVAL_DIV: 1148 break; 
// TODO: Not implemented 1149 case CEED_EVAL_CURL: 1150 break; // TODO: Not implemented 1151 } 1152 } 1153 for (CeedInt i = 0; i < numoutputfields; i++) { 1154 code << " // -- Output field "<<i<<" --\n"; 1155 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1156 CeedChk(ierr); 1157 // Basis action 1158 switch (emode) { 1159 case CEED_EVAL_NONE: 1160 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1161 break; // No action 1162 case CEED_EVAL_INTERP: 1163 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1164 break; 1165 case CEED_EVAL_GRAD: 1166 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1167 break; 1168 case CEED_EVAL_WEIGHT: 1169 break; // Should not occur 1170 case CEED_EVAL_DIV: 1171 break; // TODO: Not implemented 1172 case CEED_EVAL_CURL: 1173 break; // TODO: Not implemented 1174 } 1175 } 1176 } else { 1177 code << "\n // Note: No Collocated Gradient\n"; 1178 for (CeedInt i = 0; i < numinputfields; i++) { 1179 code << " // -- Input field "<<i<<" --\n"; 1180 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1181 } 1182 for (CeedInt i = 0; i < numoutputfields; i++) { 1183 code << " // -- Output field "<<i<<" --\n"; 1184 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1185 } 1186 } 1187 code << " // QFunction Inputs and outputs\n"; 1188 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1189 for (CeedInt i = 0; i < numinputfields; i++) { 1190 code << " // -- Input field "<<i<<" --\n"; 1191 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1192 } 1193 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1194 for (CeedInt i = 0; i < numoutputfields; i++) { 1195 code << " // -- Output field "<<i<<" --\n"; 1196 code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1197 } 1198 code << "\n // Apply QFunction\n"; 1199 string qFunctionName(qf_data->qFunctionName); 1200 code << " "<<qFunctionName<<"(ctx, "; 1201 if (dim != 3 || basis_data->d_collograd1d) { 1202 code << "1 "; 1203 }else{ 1204 code << "Q1d "; 1205 } 1206 code << ", in, 
out);\n"; 1207 if (basis_data->d_collograd1d) { 1208 code << "\n // Note: Collocated Gradient\n"; 1209 for (CeedInt i = 0; i < numoutputfields; i++) { 1210 code << " // -- Output field "<<i<<" --\n"; 1211 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1212 CeedChk(ierr); 1213 // Basis action 1214 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1215 switch (emode) { 1216 case CEED_EVAL_NONE: 1217 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1218 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1219 code << " }\n"; 1220 break; // No action 1221 case CEED_EVAL_INTERP: 1222 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1223 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1224 code << " }\n"; 1225 break; 1226 case CEED_EVAL_GRAD: 1227 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1228 break; 1229 case CEED_EVAL_WEIGHT: 1230 break; // Should not occur 1231 case CEED_EVAL_DIV: 1232 break; // TODO: Not implemented 1233 case CEED_EVAL_CURL: 1234 break; // TODO: Not implemented 1235 } 1236 } 1237 code << "}\n"; 1238 } 1239 1240 // Output basis apply if needed 1241 // Generate the correct eval mode code for each output 1242 code << "\n// Output field basis action and restrictions\n"; 1243 for (CeedInt i = 0; i < numoutputfields; i++) { 1244 code << " // -- Output field "<<i<<" --\n"; 1245 // Get elemsize, emode, ncomp 1246 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1247 CeedChk(ierr); 1248 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1249 CeedChk(ierr); 1250 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1251 CeedChk(ierr); 1252 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1253 CeedChk(ierr); 1254 // Basis action 1255 code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1256 switch (emode) { 1257 case CEED_EVAL_NONE: 1258 code << " CeedScalar* 
r_v"<<i<<" = r_tt"<<i<<";\n"; 1259 break; // No action 1260 case CEED_EVAL_INTERP: 1261 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1262 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1263 break; 1264 case CEED_EVAL_GRAD: 1265 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1266 if (basis_data->d_collograd1d) { 1267 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1268 } else { 1269 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1270 } 1271 break; 1272 case CEED_EVAL_WEIGHT: { 1273 Ceed ceed; 1274 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1275 return CeedError(ceed, 1, 1276 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1277 break; // Should not occur 1278 } 1279 case CEED_EVAL_DIV: 1280 break; // TODO: Not implemented 1281 case CEED_EVAL_CURL: 1282 break; // TODO: Not implemented 1283 } 1284 // Restriction 1285 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1286 data->indices.out[i] = restr_data->d_ind; 1287 if (data->indices.out[i]) { 1288 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1289 code << " // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n"; 1290 code << " writeDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1291 } else { 1292 bool backendstrides; 1293 ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1294 &backendstrides); 1295 CeedChk(ierr); 1296 CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1297 if (!backendstrides) { 1298 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1299 CeedChk(ierr); 1300 } 1301 ierr = cudaMalloc( 
(void **)&data->strides.out[i], 3 * sizeof(CeedInt)); 1302 CeedChk_Cu(ceed, (CUresult)ierr); 1303 ierr = cudaMemcpy(data->strides.out[i], strides, 3 * sizeof(CeedInt), 1304 cudaMemcpyHostToDevice); 1305 CeedChk_Cu(ceed, (CUresult)ierr); 1306 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1307 code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, elem, strides.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1308 } 1309 } 1310 1311 code << " }\n"; 1312 code << "}\n\n"; 1313 1314 // std::cout << code.str(); 1315 1316 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); 1317 CeedChk(ierr); 1318 ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1319 CeedChk(ierr); 1320 1321 ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1322 1323 return 0; 1324 } 1325