1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#include <ceed-backend.h>
#include "ceed-cuda-gen.h"
#include <iostream>
#include <sstream>
#include "../cuda/ceed-cuda.h"
#include "../cuda-reg/ceed-cuda-reg.h"
#include "../cuda-shared/ceed-cuda-shared.h"

// Software double-precision atomicAdd built on atomicCAS.
// Native atomicAdd(double*) requires SM60+; this string is appended to the
// generated source only when the device's compute capability is < 6
// (see the prop.major < 6 check in CeedCudaGenOperatorBuild below).
static const char *atomicAdd = QUOTE(
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);

// Device-side helper library emitted (as a string) into every generated
// operator kernel: field read/write helpers, tensor contractions, and
// interp/grad/weight basis actions for 1D, 2D, and 3D tensor-product bases.
// Thread layout assumed throughout: tidx/tidy index quadrature points of a
// Q1d x Q1d slab; `data.slice` points into dynamically allocated shared
// memory (declared `extern __shared__` in the generated kernel).
static const char *deviceFunctions = QUOTE(

typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread bookkeeping passed to every helper.
typedef struct {
  CeedInt tidx;        // threadIdx.x
  CeedInt tidy;        // threadIdx.y
  CeedInt tidz;        // threadIdx.z
  CeedInt tid;         // flattened thread index within the block
  CeedScalar* slice;   // this thread's view of shared-memory scratch
} BackendData;

// Cooperatively copy a P*Q basis matrix from global (d_B) to B.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
}

//****
// 1D
// Gather element dofs via the restriction indices (non-interlaced layout).
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Gather element dofs via the restriction indices (interlaced components).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Gather element dofs with a strided (identity) restriction.
// Fix: the template declaration was missing here (NCOMP/P1d were undeclared),
// unlike the matching readDofsStrided2d/readDofsStrided3d helpers.
template <int NCOMP, int P1d>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * strides[0] + elem * strides[2];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * strides[1]];
    }
  }
}

// Read quadrature-point values (non-interlaced layout).
template <int NCOMP, int Q1d>
inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Read quadrature-point values (interlaced components).
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add element dofs back through the restriction (non-interlaced);
// atomicAdd because multiple elements may share a node.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Scatter-add element dofs back through the restriction (interlaced).
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Scatter-add element dofs with a strided (identity) restriction.
template <int NCOMP, int P1d>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * strides[0] + elem * strides[2];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + comp * strides[1]], r_v[comp]);
    }
  }
}

// Write quadrature-point values (non-interlaced layout).
template <int NCOMP, int Q1d>
inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Write quadrature-point values (interlaced components).
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}

// 1D contraction, dofs -> quads: V(q) = sum_i B(q,i) U(i), staged via shared
// memory so each thread can read every dof value.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}

// 1D transposed contraction, quads -> dofs: V(p) = sum_q B(q,p) U(q).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < Q1d; ++i) {
    *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}

// 1D interpolation, one contraction per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D transposed interpolation.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D gradient: contract with the derivative matrix c_G (c_B unused in 1D).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

// 1D transposed gradient.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

//****
// 2D
// Gather element dofs via restriction indices (non-interlaced).
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Gather element dofs via restriction indices (interlaced components).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Gather element dofs with a strided (identity) restriction.
template <int NCOMP, int P1d>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * strides[0] + elem * strides[2];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * strides[1]];
    }
  }
}

// Read quadrature-point values (non-interlaced layout).
template <int NCOMP, int Q1d>
inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Read quadrature-point values (interlaced components).
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add element dofs back through the restriction (non-interlaced).
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Scatter-add element dofs back through the restriction (interlaced).
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Scatter-add element dofs with a strided (identity) restriction.
template <int NCOMP, int P1d>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * strides[0] + elem * strides[2];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + comp * strides[1]], r_v[comp]);
    }
  }
}

// Write quadrature-point values (non-interlaced layout).
template <int NCOMP, int Q1d>
inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Write quadrature-point values (interlaced components).
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}

// 2D tensor contraction in the x direction (dofs -> quads), staged through
// the shared-memory slice (row stride Q1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
  }
  __syncthreads();
}

// 2D tensor contraction in the y direction (dofs -> quads).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
  }
  __syncthreads();
}

// 2D transposed contraction in y (quads -> dofs); only threads with
// tidy < P1d hold valid output, but all threads hit the barriers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy < P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
  }
  __syncthreads();
}

// 2D transposed contraction in x (quads -> dofs).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// As ContractXTranspose2d, but accumulates into V instead of overwriting.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx < P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// 2D interpolation: x then y contraction per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D transposed interpolation: y then x transposed contractions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D gradient: d/dx into r_V[comp], d/dy into r_V[comp+NCOMP].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}

// 2D transposed gradient; the second derivative contribution is accumulated
// via ContractXTransposeAdd2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

//****
// 3D
// Gather element dofs (z carried in registers, one value per z-level).
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
      }
    }
  }
}

// Gather element dofs (interlaced components).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
      }
    }
  }
}

// Gather element dofs with a strided (identity) restriction.
template <int NCOMP, int P1d>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * strides[0] + elem * strides[2];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + comp * strides[1]];
      }
    }
  }
}

// Read all quadrature-point values for the element (non-interlaced).
template <int NCOMP, int Q1d>
inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind + nquads * comp];
    }
  }
}

// Read a single z-slice (q) of quadrature values (non-interlaced).
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Read all quadrature-point values (interlaced components).
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read a single z-slice (q) of quadrature values (interlaced components).
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add element dofs back through the restriction (non-interlaced).
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Scatter-add element dofs back through the restriction (interlaced).
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Scatter-add element dofs with a strided (identity) restriction.
template <int NCOMP, int P1d>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * strides[0] + elem * strides[2];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + comp * strides[1]], r_v[z+comp*P1d]);
      }
    }
  }
}

// Write all quadrature-point values (non-interlaced layout).
template <int NCOMP, int Q1d>
inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + nquads * comp] = r_v[z+comp*Q1d];
    }
  }
}

// Write all quadrature-point values (interlaced components).
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d];
    }
  }
}

// 3D x-direction contraction; this thread's row of B is cached in registers,
// z-levels processed one shared-memory slab at a time.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidx*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
    __syncthreads();
  }
}

// 3D y-direction contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidy*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
    __syncthreads();
  }
}

// 3D z-direction contraction: register-only (z-values live in this thread).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i];//contract z direction
    }
  }
}

// 3D transposed z contraction; entries k >= P1d are zeroed only.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k < P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i];//contract z direction
      }
    }
  }
}

// 3D transposed y contraction (quads -> dofs).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy < P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// As ContractTransposeY3d, but accumulates into V instead of overwriting.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy < P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// 3D transposed x contraction (quads -> dofs).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// As ContractTransposeX3d, but accumulates into V instead of overwriting.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
                                               const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// 3D interpolation: x, y, then z contraction per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}

// 3D transposed interpolation: z, y, then x transposed contractions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 3D gradient: the three derivative components go to r_V offset by
// 0/1/2 * NCOMP*Q1d (x, y, z derivative respectively).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}

// 3D transposed gradient; second and third derivative contributions are
// accumulated via the Add variants.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// Collocated 3D gradient at z-level q (basis and quadrature points coincide).
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
                                   const CeedScalar *__restrict__ r_U,
                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads();
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    // Z derivative
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
    }
    __syncthreads();
  }
}

// Transposed collocated 3D gradient at z-level q; accumulates into r_V.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
                                            const CeedScalar *__restrict__ r_U,
                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    __syncthreads();
    // Y derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    __syncthreads();
    // Z derivative
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
    }
  }
}

// Quadrature weight at this thread's 1D point.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx];
}

// Tensor-product quadrature weight at this thread's 2D point.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
}

// Tensor-product quadrature weights for all z-levels at this thread's column.
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt z = 0; z < Q1d; ++z) {
    w[z] = pw*qweight1d[z];
  }
}

);

// Build (as CUDA source text) and cache the single fused kernel implementing
// this operator: restriction gather, basis action, QFunction, basis transpose,
// and restriction scatter. Returns 0 on success, CeedError code otherwise.
extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {

  using std::ostringstream;
  using std::string;
  int ierr;
  bool setupdone;
  ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
  if (setupdone) return 0;  // kernel already generated
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
  CeedOperator_Cuda_gen *data;
  ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
  CeedQFunction qf;
  CeedQFunction_Cuda_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
  ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
  CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
  ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
  CeedChk(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
  CeedChk(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
  CeedChk(ierr);
  CeedEvalMode emode;
  CeedInterlaceMode imode;
  CeedBasis basis;
  CeedBasis_Cuda_shared *basis_data;
  CeedElemRestriction Erestrict;
  CeedElemRestriction_Cuda_reg *restr_data;

  ostringstream code;
  string devFunctions(deviceFunctions);

  // Add atomicAdd
function for old NVidia architectures 900 struct cudaDeviceProp prop; 901 Ceed delegate; 902 CeedGetDelegate(ceed, &delegate); 903 Ceed_Cuda *ceed_data; 904 ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr); 905 ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 906 if (prop.major<6){ 907 code << atomicAdd; 908 } 909 910 code << devFunctions; 911 912 string qFunction(qf_data->qFunctionSource); 913 914 code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 915 code << "\n#define CeedPragmaSIMD\n"; 916 code << qFunction; 917 918 // Setup 919 code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 920 // Input Evecs and Restriction 921 for (CeedInt i = 0; i < numinputfields; i++) { 922 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 923 CeedChk(ierr); 924 if (emode == CEED_EVAL_WEIGHT) { // Skip 925 } else { 926 code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 927 if (emode != CEED_EVAL_NONE) 928 { 929 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 930 bool isTensor; 931 ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 932 //TODO check that all are the same 933 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 934 if (isTensor) 935 { 936 //TODO check that all are the same 937 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 938 } else { 939 return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 940 } 941 } 942 } 943 } 944 data->dim = dim; 945 data->Q1d = Q1d; 946 947 for (CeedInt i = 0; i < numoutputfields; i++) { 948 code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 949 } 950 code << "const CeedInt Dim = "<<dim<<";\n"; 951 code << "const CeedInt Q1d = "<<Q1d<<";\n"; 952 // code << "const CeedInt Q = "<<Q<<";\n"; 953 code << "extern __shared__ CeedScalar slice[];\n"; 954 code << 
"BackendData data;\n"; 955 code << "data.tidx = threadIdx.x;\n"; 956 code << "data.tidy = threadIdx.y;\n"; 957 code << "data.tidz = threadIdx.z;\n"; 958 code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 959 code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 960 //Initialize constants, and matrices B and G 961 for (CeedInt i = 0; i < numinputfields; i++) { 962 code << "// Input field "<<i<<"\n"; 963 // Get elemsize, emode, ncomp 964 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 965 CeedChk(ierr); 966 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 967 CeedChk(ierr); 968 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 969 CeedChk(ierr); 970 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 971 CeedChk(ierr); 972 // Basis action 973 switch (emode) { 974 case CEED_EVAL_NONE: 975 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 976 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 977 code << " const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n"; 978 break; 979 case CEED_EVAL_INTERP: 980 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 981 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 982 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 983 code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 984 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 985 code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 986 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 987 data->B.in[i] = basis_data->d_interp1d; 988 code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 989 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 990 break; 991 case CEED_EVAL_GRAD: 992 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 993 ierr = 
CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 994 code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 995 code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 996 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 997 code << "const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 998 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 999 data->B.in[i] = basis_data->d_interp1d; 1000 code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 1001 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 1002 if (basis_data->d_collograd1d) { 1003 data->G.in[i] = basis_data->d_collograd1d; 1004 code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 1005 code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 1006 } else { 1007 data->G.in[i] = basis_data->d_grad1d; 1008 code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 1009 code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 1010 } 1011 break; 1012 case CEED_EVAL_WEIGHT: 1013 break; // No action 1014 case CEED_EVAL_DIV: 1015 break; // TODO: Not implemented 1016 case CEED_EVAL_CURL: 1017 break; // TODO: Not implemented 1018 } 1019 } 1020 for (CeedInt i = 0; i < numoutputfields; i++) { 1021 code << "// Output field "<<i<<"\n"; 1022 // Get elemsize, emode, ncomp 1023 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1024 CeedChk(ierr); 1025 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1026 CeedChk(ierr); 1027 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1028 CeedChk(ierr); 1029 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1030 CeedChk(ierr); 1031 // Basis action 1032 switch (emode) { 1033 case CEED_EVAL_NONE: 1034 code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 1035 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 1036 code << " const CeedInt nquads_out_"<<i<<" = 
"<<nnodes<<";\n"; 1037 break; // No action 1038 case CEED_EVAL_INTERP: 1039 code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 1040 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 1041 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 1042 code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 1043 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1044 data->B.out[i] = basis_data->d_interp1d; 1045 code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1046 code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1047 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 1048 code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 1049 break; 1050 case CEED_EVAL_GRAD: 1051 code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 1052 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 1053 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 1054 code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 1055 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1056 data->B.out[i] = basis_data->d_interp1d; 1057 code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1058 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1059 if (basis_data->d_collograd1d) { 1060 data->G.out[i] = basis_data->d_collograd1d; 1061 code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 1062 code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1063 } else { 1064 data->G.out[i] = basis_data->d_grad1d; 1065 code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1066 code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1067 } 1068 ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 1069 code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 1070 break; 1071 case CEED_EVAL_WEIGHT: { 1072 
Ceed ceed; 1073 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1074 return CeedError(ceed, 1, 1075 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1076 break; // Should not occur 1077 } 1078 case CEED_EVAL_DIV: 1079 break; // TODO: Not implemented 1080 case CEED_EVAL_CURL: 1081 break; // TODO: Not implemented 1082 } 1083 } 1084 code << "\n"; 1085 code << "__syncthreads();\n"; 1086 code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1087 // Input basis apply if needed 1088 // Generate the correct eval mode code for each input 1089 for (CeedInt i = 0; i < numinputfields; i++) { 1090 code << " // Input field "<<i<<"\n"; 1091 // Get elemsize, emode, ncomp 1092 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1093 CeedChk(ierr); 1094 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1095 CeedChk(ierr); 1096 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1097 CeedChk(ierr); 1098 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1099 CeedChk(ierr); 1100 // Basis action 1101 switch (emode) { 1102 case CEED_EVAL_NONE: 1103 if (!basis_data->d_collograd1d) { 1104 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1105 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1106 code << " readQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n"; 1107 } 1108 break; 1109 case CEED_EVAL_INTERP: 1110 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1111 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1112 data->indices.in[i] = restr_data->d_ind; 1113 if (data->indicies.in[i]) { 1114 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1115 } else { 1116 ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.in[i])); CeedChk(ierr); 1117 } 1118 code 
<< " readDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, strides.in["<<i<<"], indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1119 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1120 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1121 break; 1122 case CEED_EVAL_GRAD: 1123 code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1124 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1125 data->indices.in[i] = restr_data->d_ind; 1126 if (data->indicies.in[i]) { 1127 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1128 } else { 1129 ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.in[i])); CeedChk(ierr); 1130 } 1131 code << " readDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, strides.in["<<i<<"], indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1132 if (basis_data->d_collograd1d) { 1133 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1134 code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1135 } else { 1136 code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1137 code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1138 } 1139 break; 1140 case CEED_EVAL_WEIGHT: 1141 code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1142 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1143 ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1144 data->W = basis_data->d_qweight1d; 1145 code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1146 break; // No action 1147 case CEED_EVAL_DIV: 1148 break; // TODO: Not implemented 1149 
case CEED_EVAL_CURL: 1150 break; // TODO: Not implemented 1151 } 1152 } 1153 1154 // Q function 1155 for (CeedInt i = 0; i < numoutputfields; i++) { 1156 code << " // Output field "<<i<<"\n"; 1157 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1158 CeedChk(ierr); 1159 if (emode==CEED_EVAL_GRAD) 1160 { 1161 if (basis_data->d_collograd1d) { 1162 //Accumulator for gradient slices 1163 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1164 code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1165 code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1166 code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1167 code << " }\n"; 1168 code << " }\n"; 1169 } else { 1170 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1171 } 1172 } 1173 if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1174 { 1175 code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1176 } 1177 } 1178 //We treat quadrature points per slice in 3d to save registers 1179 code << "// QFunction\n"; 1180 if (basis_data->d_collograd1d) { 1181 code << "#pragma unroll\n"; 1182 code << "for (CeedInt q=0; q<Q1d; q++) {\n"; 1183 for (CeedInt i = 0; i < numinputfields; i++) { 1184 code << " // Input field "<<i<<"\n"; 1185 // Get elemsize, emode, ncomp 1186 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1187 CeedChk(ierr); 1188 // Basis action 1189 switch (emode) { 1190 case CEED_EVAL_NONE: 1191 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1192 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1193 code << " readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1194 break; 1195 case CEED_EVAL_INTERP: 1196 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1197 code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1198 code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1199 code << " }\n"; 1200 break; 1201 case 
CEED_EVAL_GRAD: 1202 code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1203 code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1204 break; 1205 case CEED_EVAL_WEIGHT: 1206 code << " CeedScalar r_q"<<i<<"[1];\n"; 1207 code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1208 break; // No action 1209 case CEED_EVAL_DIV: 1210 break; // TODO: Not implemented 1211 case CEED_EVAL_CURL: 1212 break; // TODO: Not implemented 1213 } 1214 } 1215 for (CeedInt i = 0; i < numoutputfields; i++) { 1216 code << " // Output field "<<i<<"\n"; 1217 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1218 CeedChk(ierr); 1219 // Basis action 1220 switch (emode) { 1221 case CEED_EVAL_NONE: 1222 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1223 break; // No action 1224 case CEED_EVAL_INTERP: 1225 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1226 break; 1227 case CEED_EVAL_GRAD: 1228 code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1229 break; 1230 case CEED_EVAL_WEIGHT: 1231 break; // Should not occur 1232 case CEED_EVAL_DIV: 1233 break; // TODO: Not implemented 1234 case CEED_EVAL_CURL: 1235 break; // TODO: Not implemented 1236 } 1237 } 1238 } else { 1239 for (CeedInt i = 0; i < numinputfields; i++) { 1240 code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1241 } 1242 for (CeedInt i = 0; i < numoutputfields; i++) { 1243 code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1244 } 1245 } 1246 code << " CeedScalar* in["<<numinputfields<<"];\n"; 1247 for (CeedInt i = 0; i < numinputfields; i++) { 1248 code << " in["<<i<<"] = r_q"<<i<<";\n"; 1249 } 1250 code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1251 for (CeedInt i = 0; i < numoutputfields; i++) { 1252 code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1253 } 1254 string qFunctionName(qf_data->qFunctionName); 1255 code << " "<<qFunctionName<<"(ctx, "; 1256 if (dim != 3 || basis_data->d_collograd1d) { 1257 code << "1 "; 1258 }else{ 1259 code << 
"Q1d "; 1260 } 1261 code << ", in, out);\n"; 1262 if (basis_data->d_collograd1d) { 1263 for (CeedInt i = 0; i < numoutputfields; i++) { 1264 code << " // Output field "<<i<<"\n"; 1265 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1266 CeedChk(ierr); 1267 // Basis action 1268 switch (emode) { 1269 case CEED_EVAL_NONE: 1270 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1271 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1272 code << " }\n"; 1273 break; // No action 1274 case CEED_EVAL_INTERP: 1275 code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1276 code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1277 code << " }\n"; 1278 break; 1279 case CEED_EVAL_GRAD: 1280 code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1281 break; 1282 case CEED_EVAL_WEIGHT: 1283 break; // Should not occur 1284 case CEED_EVAL_DIV: 1285 break; // TODO: Not implemented 1286 case CEED_EVAL_CURL: 1287 break; // TODO: Not implemented 1288 } 1289 } 1290 code << "}\n"; 1291 } 1292 1293 // Output basis apply if needed 1294 // Generate the correct eval mode code for each output 1295 for (CeedInt i = 0; i < numoutputfields; i++) { 1296 code << " // Output field "<<i<<"\n"; 1297 // Get elemsize, emode, ncomp 1298 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1299 CeedChk(ierr); 1300 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1301 CeedChk(ierr); 1302 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1303 CeedChk(ierr); 1304 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1305 CeedChk(ierr); 1306 // Basis action 1307 switch (emode) { 1308 case CEED_EVAL_NONE: 1309 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1310 code << " writeQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n"; 1311 
break; // No action 1312 case CEED_EVAL_INTERP: 1313 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1314 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1315 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1316 data->indices.out[i] = restr_data->d_ind; 1317 if (data->indicies.out[i]) { 1318 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1319 } else { 1320 ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.out[i])); CeedChk(ierr); 1321 } 1322 code << " writeDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, strides.out["<<i<<"], indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1323 break; 1324 case CEED_EVAL_GRAD: 1325 code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1326 if (basis_data->d_collograd1d) { 1327 code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1328 } else { 1329 code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1330 } 1331 ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1332 data->indices.out[i] = restr_data->d_ind; 1333 if (data->indicies.out[i]) { 1334 ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1335 } else { 1336 ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.out[i])); CeedChk(ierr); 1337 } 1338 code << " writeDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, strides.out["<<i<<"], indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1339 break; 1340 case CEED_EVAL_WEIGHT: { 1341 Ceed ceed; 1342 ierr = 
CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1343 return CeedError(ceed, 1, 1344 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1345 break; // Should not occur 1346 } 1347 case CEED_EVAL_DIV: 1348 break; // TODO: Not implemented 1349 case CEED_EVAL_CURL: 1350 break; // TODO: Not implemented 1351 } 1352 } 1353 1354 code << " }\n"; 1355 code << "}\n\n"; 1356 1357 // std::cout << code.str(); 1358 1359 ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr); 1360 ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1361 CeedChk(ierr); 1362 1363 ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1364 1365 return 0; 1366 } 1367