1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#include <ceed-backend.h>
#include <ceed.h>
#include "ceed-cuda-shared.h"
#include "../cuda/ceed-cuda.h"

//*********************
// shared mem kernels
//
// This string is JIT-compiled per basis (see CeedBasisCreateTensorH1_Cuda_shared)
// with the integer macros Q1D, P1D, BASIS_DIM and BASIS_NCOMP defined at
// compile time, so they act as compile-time constants below.
static const char *kernelsShared = QUOTE(

// Accumulate the Q1D entries of r_U into r_V (used to sum transpose-mode
// gradient contributions over dimensions).
inline __device__ void add(CeedScalar *r_V, const CeedScalar *r_U) {
  for (int i = 0; i < Q1D; i++)
    r_V[i] += r_U[i];
}

//////////
// 1D //
//////////

// Load the P1D nodal values of (elem, comp) into this thread-z's slice row,
// zero-padding entries [P1D, Q1D) so contractions can loop over Q1D.
inline __device__ void readDofs1d(const int elem, const int tidx,
                                  const int tidy, const int tidz, const int comp,
                                  const int nelem, const CeedScalar *d_U,
                                  CeedScalar *slice) {
  for (int i = 0; i < P1D; i++)
    slice[i+tidz*Q1D] = d_U[i + comp*P1D + elem*BASIS_NCOMP*P1D];
  for (int i = P1D; i < Q1D; i++)
    slice[i+tidz*Q1D] = 0.0;
}

// Store one nodal value; only the first P1D threads in x hold valid data.
inline __device__ void writeDofs1d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int nelem, const CeedScalar &r_V,
                                   CeedScalar *d_V) {
  if (tidx<P1D) {
    d_V[tidx + comp*P1D + elem*BASIS_NCOMP*P1D] = r_V;
  }
}

// Load the Q1D quadrature values of (elem, comp, dim) into the slice row.
inline __device__ void readQuads1d(const int elem, const int tidx,
                                   const int tidy, const int tidz, const int comp,
                                   const int dim, const int nelem,
                                   const CeedScalar *d_U, CeedScalar *slice) {
  for (int i = 0; i < Q1D; i++)
    slice[i+tidz*Q1D] = d_U[i + elem*Q1D + comp*Q1D*nelem +
                            dim*BASIS_NCOMP*nelem*Q1D];
}

// Store one quadrature value; every thread in x (tidx < Q1D) writes.
inline __device__ void writeQuads1d(const int elem, const int tidx,
                                    const int tidy, const int comp,
                                    const int dim, const int nelem,
                                    const CeedScalar &r_V, CeedScalar *d_V) {
  d_V[tidx + elem*Q1D + comp*Q1D*nelem + dim*BASIS_NCOMP*nelem*Q1D] = r_V;
}

// V(tidx) = sum_i B(tidx, i) * slice(i): nodes -> quadrature points.
// Note: the U argument is unused; the input lives in slice.
inline __device__ void ContractX1d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar &U, const CeedScalar *B,
                                   CeedScalar &V) {
  V = 0.0;
  for (int i = 0; i < P1D; ++i) {
    V += B[i + tidx*P1D] * slice[i+tidz*Q1D];//contract x direction
  }
}

// V(tidx) = sum_i B(i, tidx) * slice(i): quadrature points -> nodes
// (transpose of ContractX1d). The U argument is unused.
inline __device__ void ContractTransposeX1d(CeedScalar *slice, const int tidx,
                                            const int tidy, const int tidz,
                                            const CeedScalar &U, const CeedScalar *B,
                                            CeedScalar &V) {
  V = 0.0;
  for (int i = 0; i < Q1D; ++i) {
    V += B[tidx + i*P1D] * slice[i+tidz*Q1D];//contract x direction
  }
}

// 1D interpolation (and its transpose) for all components of a batch of
// elements; one element per thread-z, Q1D threads in x.
inline __device__ void interp1d(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B,
                                const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V,
                                CeedScalar *slice) {
  CeedScalar r_V;
  CeedScalar r_t;

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;


  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
       elem += gridDim.x*blockDim.z) {
    for(int comp=0; comp<BASIS_NCOMP; comp++) {
      if(!transpose) {
        readDofs1d(elem, tidx, tidy, tidz, comp, nelem, d_U, slice);
        ContractX1d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
        writeQuads1d(elem, tidx, tidy, comp, 0, nelem, r_V, d_V);
      } else {
        readQuads1d(elem, tidx, tidy, tidz, comp, 0, nelem, d_U, slice);
        ContractTransposeX1d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
        writeDofs1d(elem, tidx, tidy, comp, nelem, r_V, d_V);
      }
    }
  }
}

// 1D gradient (and its transpose): same structure as interp1d but
// contracting with the 1D derivative matrix c_G.
inline __device__ void grad1d(const CeedInt nelem, const int transpose,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              const CeedScalar *__restrict__ d_U,
                              CeedScalar *__restrict__ d_V,
                              CeedScalar *slice) {
  CeedScalar r_U;
  CeedScalar r_V;

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  int dim;

  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
       elem += gridDim.x*blockDim.z) {
    for(int comp=0; comp<BASIS_NCOMP; comp++) {
      if(!transpose) {
        readDofs1d(elem, tidx, tidy, tidz, comp, nelem, d_U, slice);
        ContractX1d(slice, tidx, tidy, tidz, r_U, c_G, r_V);
        dim = 0;
        writeQuads1d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
      } else {
        dim = 0;
        readQuads1d(elem, tidx, tidy, tidz, comp, dim, nelem, d_U, slice);
        ContractTransposeX1d(slice, tidx, tidy, tidz, r_U, c_G, r_V);
        writeDofs1d(elem, tidx, tidy, comp, nelem, r_V, d_V);
      }
    }
  }
}
//////////
// 2D //
//////////

// Load one nodal value per thread; threads outside the P1D x P1D node grid
// get 0.0 so they contribute nothing to later contractions.
inline __device__ void readDofs2d(const int elem, const int tidx,
                                  const int tidy, const int comp,
                                  const int nelem, const CeedScalar *d_U,
                                  CeedScalar &U) {
  U = (tidx<P1D
       && tidy<P1D) ? d_U[tidx + tidy*P1D + comp*P1D*P1D + elem*BASIS_NCOMP*P1D*P1D ] :
      0.0;
}

// Store one nodal value; only threads inside the P1D x P1D node grid write.
inline __device__ void writeDofs2d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int nelem, const CeedScalar &r_V,
                                   CeedScalar *d_V) {
  if (tidx<P1D && tidy<P1D) {
    d_V[tidx + tidy*P1D + comp*P1D*P1D + elem*BASIS_NCOMP*P1D*P1D ] = r_V;
  }
}

// Load one quadrature value of (elem, comp, dim) per thread.
inline __device__ void readQuads2d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int dim, const int nelem,
                                   const CeedScalar *d_U, CeedScalar &U ) {
  U = d_U[tidx + tidy*Q1D + elem*Q1D*Q1D + comp*Q1D*Q1D*nelem +
          dim*BASIS_NCOMP*nelem*Q1D*Q1D];
}

// Store one quadrature value of (elem, comp, dim) per thread.
inline __device__ void writeQuads2d(const int elem, const int tidx,
                                    const int tidy, const int comp,
                                    const int dim, const int nelem,
                                    const CeedScalar &r_V, CeedScalar *d_V) {
  d_V[tidx + tidy*Q1D + elem*Q1D*Q1D + comp*Q1D*Q1D*nelem +
      dim*BASIS_NCOMP*nelem*Q1D*Q1D ] = r_V;
}

// Contract the x index: stage U in shared memory, then
// V(tidx, tidy) = sum_i B(tidx, i) * U(i, tidy).
inline __device__ void ContractX2d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar &U, const CeedScalar *B,
                                   CeedScalar &V) {
  slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U;
  __syncthreads();
  V = 0.0;
  for (int i = 0; i < P1D; ++i) {
    V += B[i + tidx*P1D] * slice[i + tidy*Q1D + tidz*Q1D*Q1D];//contract x direction
  }
  __syncthreads();
}

// Contract the y index: V(tidx, tidy) = sum_i B(tidy, i) * U(tidx, i).
inline __device__ void ContractY2d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar &U, const CeedScalar *B,
                                   CeedScalar &V) {
  slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U;
  __syncthreads();
  V = 0.0;
  for (int i = 0; i < P1D; ++i) {
    V += B[i + tidy*P1D] * slice[tidx + i*Q1D + tidz*Q1D*Q1D];//contract y direction
  }
  __syncthreads();
}

// Transpose contraction in y: V(tidx, tidy) = sum_i B(i, tidy) * U(tidx, i);
// only threads with tidy < P1D produce a nonzero result.
inline __device__ void ContractTransposeY2d(CeedScalar *slice, const int tidx,
                                            const int tidy, const int tidz,
                                            const CeedScalar &U, const CeedScalar *B,
                                            CeedScalar &V) {
  slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U;
  __syncthreads();
  V = 0.0;
  if (tidy<P1D) {
    for (int i = 0; i < Q1D; ++i) {
      V += B[tidy + i*P1D] * slice[tidx + i*Q1D + tidz*Q1D*Q1D];//contract y direction
    }
  }
  __syncthreads();
}

// Transpose contraction in x: V(tidx, tidy) = sum_i B(i, tidx) * U(i, tidy);
// only threads with tidx < P1D produce a nonzero result.
inline __device__ void ContractTransposeX2d(CeedScalar *slice, const int tidx,
                                            const int tidy, const int tidz,
                                            const CeedScalar &U, const CeedScalar *B,
                                            CeedScalar &V) {
  slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U;
  __syncthreads();
  V = 0.0;
  if (tidx<P1D) {
    for (int i = 0; i < Q1D; ++i) {
      V += B[tidx + i*P1D] * slice[i + tidy*Q1D + tidz*Q1D*Q1D];//contract x direction
    }
  }
  __syncthreads();
}

// 2D tensor interpolation (and its transpose). Thread-z packs
// (element-in-block, component): comp = tidz % BASIS_NCOMP.
// NOTE(review): Contract*2d call __syncthreads() from inside the element
// loop, whose trip count depends on blockElem (i.e., on tidz); confirm all
// threads of a block always iterate the same number of times.
inline __device__ void interp2d(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B,
                                const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V,
                                CeedScalar *slice) {
  CeedScalar r_V;
  CeedScalar r_t;

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int blockElem = tidz/BASIS_NCOMP;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  const int comp = tidz%BASIS_NCOMP;

  for (CeedInt elem = blockIdx.x*elemsPerBlock + blockElem; elem < nelem;
       elem += gridDim.x*elemsPerBlock) {
    // (fixed) comp was redundantly re-declared here, shadowing the
    // function-scope declaration above with the identical value.
    r_V = 0.0;
    r_t = 0.0;
    if(!transpose) {
      readDofs2d(elem, tidx, tidy, comp, nelem, d_U, r_V);
      ContractX2d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractY2d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      writeQuads2d(elem, tidx, tidy, comp, 0, nelem, r_V, d_V);
    } else {
      readQuads2d(elem, tidx, tidy, comp, 0, nelem, d_U, r_V);
      ContractTransposeY2d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractTransposeX2d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      writeDofs2d(elem, tidx, tidy, comp, nelem, r_V, d_V);
    }
  }
}

// 2D tensor gradient (and its transpose): applies c_G in one direction and
// c_B in the other, once per dimension; transpose mode sums the two
// contributions before writing nodes.
inline __device__ void grad2d(const CeedInt nelem, const int transpose,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              const CeedScalar *__restrict__ d_U,
                              CeedScalar *__restrict__ d_V,
                              CeedScalar *slice) {
  CeedScalar r_U;
  CeedScalar r_V;
  CeedScalar r_t;

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int blockElem = tidz/BASIS_NCOMP;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  const int comp = tidz%BASIS_NCOMP;
  int dim;

  for (CeedInt elem = blockIdx.x*elemsPerBlock + blockElem; elem < nelem;
       elem += gridDim.x*elemsPerBlock) {
    if(!transpose) {
      readDofs2d(elem, tidx, tidy, comp, nelem, d_U, r_U);
      ContractX2d(slice, tidx, tidy, tidz, r_U, c_G, r_t);
      ContractY2d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      dim = 0;
      writeQuads2d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
      ContractX2d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractY2d(slice, tidx, tidy, tidz, r_t, c_G, r_V);
      dim = 1;
      writeQuads2d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
    } else {
      dim = 0;
      readQuads2d(elem, tidx, tidy, comp, dim, nelem, d_U, r_U);
      ContractTransposeY2d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractTransposeX2d(slice, tidx, tidy, tidz, r_t, c_G, r_V);
      dim = 1;
      readQuads2d(elem, tidx, tidy, comp, dim, nelem, d_U, r_U);
      ContractTransposeY2d(slice, tidx, tidy, tidz, r_U, c_G, r_t);
      ContractTransposeX2d(slice, tidx, tidy, tidz, r_t, c_B, r_U);
      r_V+=r_U;
      writeDofs2d(elem, tidx, tidy, comp, nelem, r_V, d_V);
    }
  }
}

//////////
// 3D //
//////////

// Load the x-y plane values of one element/component into the per-thread
// register column r_U[i] (i = z index); threads outside the P1D x P1D node
// grid and entries [P1D, Q1D) are zero-padded.
inline __device__ void readDofs3d(const int elem, const int tidx,
                                  const int tidy, const int comp,
                                  const int nelem, const CeedScalar *d_U, CeedScalar *r_U) {
  for (int i = 0; i < P1D; i++)
    r_U[i] = (tidx<P1D
              && tidy<P1D) ? d_U[tidx + tidy*P1D + i*P1D*P1D + comp*P1D*P1D*P1D +
                                 elem*BASIS_NCOMP*P1D*P1D*P1D ] : 0.0;
  for (int i = P1D; i < Q1D; i++)
    r_U[i] = 0.0;
}

// Load the z column of quadrature values of (elem, comp, dim) into r_U.
inline __device__ void readQuads3d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int dim, const int nelem, const CeedScalar *d_U, CeedScalar *r_U) {
  for (int i = 0; i < Q1D; i++)
    r_U[i] = d_U[tidx + tidy*Q1D + i*Q1D*Q1D + elem*Q1D*Q1D*Q1D +
                 comp*Q1D*Q1D*Q1D*nelem + dim*BASIS_NCOMP*nelem*Q1D*Q1D*Q1D];
}

// Store the z column of nodal values; only threads inside the P1D x P1D
// node grid write, and only the first P1D column entries.
inline __device__ void writeDofs3d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int nelem, const CeedScalar *r_V, CeedScalar *d_V) {
  if (tidx<P1D && tidy<P1D) {
    for (int i = 0; i < P1D; i++)
      d_V[tidx + tidy*P1D + i*P1D*P1D + comp*P1D*P1D*P1D +
          elem*BASIS_NCOMP*P1D*P1D*P1D ] = r_V[i];
  }
}

// Store the z column of quadrature values of (elem, comp, dim).
inline __device__ void writeQuads3d(const int elem, const int tidx,
                                    const int tidy, const int comp,
                                    const int dim, const int nelem, const CeedScalar *r_V, CeedScalar *d_V) {
  for (int i = 0; i < Q1D; i++)
    d_V[tidx + tidy*Q1D + i*Q1D*Q1D + elem*Q1D*Q1D*Q1D + comp*Q1D*Q1D*Q1D*nelem +
        dim*BASIS_NCOMP*nelem*Q1D*Q1D*Q1D ] = r_V[i];
}

// Contract the x index plane-by-plane: for each z level k, stage U[k] in
// shared memory and compute V[k](tidx, tidy) = sum_i B(tidx, i) * U[k](i, tidy).
// NOTE(review): __syncthreads() here is reached from callers' element loops;
// confirm all threads of a block iterate those loops the same number of times.
inline __device__ void ContractX3d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1D; ++k) {
    slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1D; ++i) {
      V[k] += B[i + tidx*P1D] * slice[i + tidy*Q1D +
                                      tidz*Q1D*Q1D];//contract x direction
    }
    __syncthreads();
  }
}

// Contract the y index plane-by-plane (see ContractX3d):
// V[k](tidx, tidy) = sum_i B(tidy, i) * U[k](tidx, i).
inline __device__ void ContractY3d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1D; ++k) {
    slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1D; ++i) {
      V[k] += B[i + tidy*P1D] * slice[tidx + i*Q1D +
                                      tidz*Q1D*Q1D];//contract y direction
    }
    __syncthreads();
  }
}

// Contract the z index entirely in registers (the z column is thread-local,
// so no shared memory or synchronization is needed):
// V[k] = sum_i B(k, i) * U[i].
inline __device__ void ContractZ3d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1D; ++k) {
    V[k] = 0.0;
    for (int i = 0; i < P1D; ++i) {
      V[k] += B[i + k*P1D] * U[i];//contract z direction
    }
  }
}

// Transpose z contraction, also register-only: V[k] = sum_i B(i, k) * U[i]
// for k < P1D; remaining entries are zeroed.
inline __device__ void ContractTransposeZ3d(CeedScalar *slice, const int tidx,
                                            const int tidy, const int tidz,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1D; ++k) {
    V[k] = 0.0;
    if (k<P1D) {
      for (int i = 0; i < Q1D; ++i) {
        V[k] += B[k + i*P1D] * U[i];//contract z direction
      }
    }
  }
}

// Transpose y contraction plane-by-plane; only threads with tidy < P1D
// produce a nonzero result.
inline __device__ void ContractTransposeY3d(CeedScalar *slice, const int tidx,
                                            const int tidy, const int tidz,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1D; ++k) {
    slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (tidy<P1D) {
      for (int i = 0; i < Q1D; ++i) {
        V[k] += B[tidy + i*P1D] * slice[tidx + i*Q1D +
                                        tidz*Q1D*Q1D];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Transpose x contraction plane-by-plane; only threads with tidx < P1D
// produce a nonzero result.
inline __device__ void ContractTransposeX3d(CeedScalar *slice, const int tidx,
                                            const int tidy, const int tidz,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1D; ++k) {
    slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (tidx<P1D) {
      for (int i = 0; i < Q1D; ++i) {
        V[k] += B[tidx + i*P1D] * slice[i + tidy*Q1D +
                                        tidz*Q1D*Q1D];//contract x direction
      }
    }
    __syncthreads();
  }
}

// 3D tensor interpolation (and its transpose): contract x and y through
// shared memory and z in registers. Thread-z packs (element-in-block,
// component): comp = tidz % BASIS_NCOMP.
inline __device__ void interp3d(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V,
                                CeedScalar *slice) {
  CeedScalar r_V[Q1D];
  CeedScalar r_t[Q1D];

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int blockElem = tidz/BASIS_NCOMP;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  const int comp = tidz%BASIS_NCOMP;

  for (CeedInt elem = blockIdx.x*elemsPerBlock + blockElem; elem < nelem;
       elem += gridDim.x*elemsPerBlock) {
    for (int i = 0; i < Q1D; ++i) {
      r_V[i] = 0.0;
      r_t[i] = 0.0;
    }
    if(!transpose) {
      readDofs3d(elem, tidx, tidy, comp, nelem, d_U, r_V);
      ContractX3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractY3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      ContractZ3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      writeQuads3d(elem, tidx, tidy, comp, 0, nelem, r_t, d_V);
    } else {
      readQuads3d(elem, tidx, tidy, comp, 0, nelem, d_U, r_V);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      writeDofs3d(elem, tidx, tidy, comp, nelem, r_t, d_V);
    }
  }
}

// 3D tensor gradient (and its transpose): for each dimension, apply c_G in
// that direction and c_B in the other two; transpose mode accumulates the
// three contributions via add() before writing nodes.
inline __device__ void grad3d(const CeedInt nelem, const int transpose,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V,
                              CeedScalar *slice) {
  //use P1D for one of these
  CeedScalar r_U[Q1D];
  CeedScalar r_V[Q1D];
  CeedScalar r_t[Q1D];

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int blockElem = tidz/BASIS_NCOMP;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  const int comp = tidz%BASIS_NCOMP;
  int dim;

  for (CeedInt elem = blockIdx.x*elemsPerBlock + blockElem; elem < nelem;
       elem += gridDim.x*elemsPerBlock) {
    if(!transpose) {
      readDofs3d(elem, tidx, tidy, comp, nelem, d_U, r_U);
      ContractX3d(slice, tidx, tidy, tidz, r_U, c_G, r_V);
      ContractY3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractZ3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      dim = 0;
      writeQuads3d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
      ContractX3d(slice, tidx, tidy, tidz, r_U, c_B, r_V);
      ContractY3d(slice, tidx, tidy, tidz, r_V, c_G, r_t);
      ContractZ3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      dim = 1;
      writeQuads3d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
      ContractX3d(slice, tidx, tidy, tidz, r_U, c_B, r_V);
      ContractY3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractZ3d(slice, tidx, tidy, tidz, r_t, c_G, r_V);
      dim = 2;
      writeQuads3d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
    } else {
      dim = 0;
      readQuads3d(elem, tidx, tidy, comp, dim, nelem, d_U, r_U);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_B, r_U);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_U, c_G, r_V);
      dim = 1;
      readQuads3d(elem, tidx, tidy, comp, dim, nelem, d_U, r_U);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_G, r_U);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      add(r_V, r_t);
      dim = 2;
      readQuads3d(elem, tidx, tidy, comp, dim, nelem, d_U, r_U);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_U, c_G, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_B, r_U);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      add(r_V, r_t);
      writeDofs3d(elem, tidx, tidy, comp, nelem, r_V, d_V);
    }
  }
}

/////////////
// Kernels //
/////////////

// Entry point for interpolation; dispatches on the compile-time BASIS_DIM.
// Dynamic shared memory ("slice") must hold blockDim.z*Q1D (1D) or
// blockDim.z*Q1D*Q1D (2D/3D) CeedScalars.
extern "C" __global__ void interp(const CeedInt nelem, const int transpose,
                                  const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                  CeedScalar *__restrict__ d_V) {
  extern __shared__ double slice[];
  if (BASIS_DIM==1) {
    interp1d(nelem, transpose, c_B, d_U, d_V, slice);
  } else if (BASIS_DIM==2) {
    interp2d(nelem, transpose, c_B, d_U, d_V, slice);
  } else if (BASIS_DIM==3) {
    interp3d(nelem, transpose, c_B, d_U, d_V, slice);
  }
}

// Entry point for gradients; same dispatch and shared-memory contract as
// interp, with the additional 1D derivative matrix c_G.
extern "C" __global__ void grad(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B, const CeedScalar *c_G,
                                const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ double slice[];
  if (BASIS_DIM==1) {
    grad1d(nelem, transpose, c_B, c_G, d_U, d_V, slice);
  } else if (BASIS_DIM==2) {
    grad2d(nelem, transpose, c_B, c_G, d_U, d_V, slice);
  } else if (BASIS_DIM==3) {
    grad3d(nelem, transpose, c_B, c_G, d_U, d_V, slice);
  }
}

/////////////
// Weights //
/////////////

// 1D quadrature weights: thread x indexes the point, thread y the element
// within the block.
__device__ void weight1d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  const int tid = threadIdx.x;
  const CeedScalar weight = qweight1d[tid];
  for (CeedInt elem = blockIdx.x*blockDim.y + threadIdx.y; elem < nelem;
       elem += gridDim.x*blockDim.y) {
    const int ind = elem*Q1D + tid;
    w[ind] = weight;
  }
}

// 2D quadrature weights: tensor product of the 1D weights; thread z indexes
// the element within the block.
__device__ void weight2d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  const int i = threadIdx.x;
  const int j = threadIdx.y;
  const CeedScalar weight = qweight1d[i]*qweight1d[j];
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
       elem += gridDim.x*blockDim.z) {
    const int ind = elem*Q1D*Q1D + i + j*Q1D;
    w[ind] = weight;
  }
}

// 3D quadrature weights: one element per block (all thread dims index the
// quadrature point).
__device__ void weight3d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  const int i = threadIdx.x;
  const int j = threadIdx.y;
  const int k = threadIdx.z;
  const CeedScalar weight = qweight1d[i]*qweight1d[j]*qweight1d[k];
  for (int e = blockIdx.x; e < nelem; e += gridDim.x) {
    const int ind = e*Q1D*Q1D*Q1D + i + j*Q1D + k*Q1D*Q1D;
    w[ind] = weight;
  }
}

// Entry point for quadrature weights; dispatches on BASIS_DIM.
extern "C" __global__ void weight(const CeedInt nelem,
                                  const CeedScalar *__restrict__ qweight1d, CeedScalar *__restrict__ v) {
  if (BASIS_DIM==1) {
    weight1d(nelem, qweight1d, v);
  } else if (BASIS_DIM==2) {
    weight2d(nelem, qweight1d, v);
  } else if (BASIS_DIM==3) {
    weight3d(nelem, qweight1d, v);
  }
}

);

// Copy the interpolation matrix (and optionally the gradient matrix) to
// device constant/global storage; implemented in the plain cuda backend.
int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P1d, CeedInt Q1d,
                       CeedScalar **c_B);
int CeedCudaInitInterpGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P1d,
                           CeedInt Q1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);

// Apply the tensor-product basis (interp/grad/weight) to u, writing v.
// Chooses launch geometry per dimension; in transpose mode v is zeroed
// first because the kernels accumulate into it.
int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt nelem,
                                     CeedTransposeMode tmode,
                                     CeedEvalMode emode, CeedVector u, CeedVector v) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);
  Ceed_Cuda_shared *ceed_Cuda;
  // Fix: the return values of these two getters were previously discarded
  // while the stale ierr was checked.
  ierr = CeedGetData(ceed, (void *) &ceed_Cuda); CeedChk(ierr);
  CeedBasis_Cuda_shared *data;
  ierr = CeedBasisGetData(basis, (void *)&data); CeedChk(ierr);
  const CeedInt transpose = tmode == CEED_TRANSPOSE;
  CeedInt dim, ncomp;
  ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
  ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChk(ierr);

  const CeedScalar *d_u;
  CeedScalar *d_v;
  if(emode!=CEED_EVAL_WEIGHT) {
    ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChk(ierr);
  }
  ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChk(ierr);

  if (tmode == CEED_TRANSPOSE) {
    CeedInt length;
    ierr = CeedVectorGetLength(v, &length); CeedChk(ierr);
    // Fix: cudaMemset returns a cudaError_t, so use the CUDA error checker
    // (CeedChk would misinterpret a nonzero CUDA status).
    ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar));
    CeedChk_Cu(ceed, ierr);
  }
  if (emode == CEED_EVAL_INTERP) {
    CeedInt P1d, Q1d;
    ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
    ierr = CeedCudaInitInterp(data->d_interp1d, P1d, Q1d, &data->c_B);
    CeedChk(ierr);
    void *interpargs[] = {(void *) &nelem, (void *) &transpose, &data->c_B, &d_u, &d_v};
    if (dim==1) {
      CeedInt elemsPerBlock = 32;
      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
                                             ? 1 : 0 );
      CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->interp, grid, Q1d, 1,
                                        elemsPerBlock, sharedMem,
                                        interpargs);
      CeedChk(ierr);
    } else if (dim==2) {
      // Tuned elements-per-block by Q1d; fix: integer division by ncomp
      // could yield 0 (e.g. ncomp > tuned value), causing a division by
      // zero in the grid computation below.
      const CeedInt optElems[7] = {0,32,8,6,4,2,8};
      CeedInt elemsPerBlock = Q1d < 7 ? optElems[Q1d]/ncomp : 1;
      if (elemsPerBlock < 1) elemsPerBlock = 1;
      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
                                             ? 1 : 0 );
      CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->interp, grid, Q1d, Q1d,
                                        ncomp*elemsPerBlock, sharedMem,
                                        interpargs);
      CeedChk(ierr);
    } else if (dim==3) {
      CeedInt elemsPerBlock = 1;
      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
                                             ? 1 : 0 );
      CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->interp, grid, Q1d, Q1d,
                                        ncomp*elemsPerBlock, sharedMem,
                                        interpargs);
      CeedChk(ierr);
    }
  } else if (emode == CEED_EVAL_GRAD) {
    CeedInt P1d, Q1d;
    ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
    ierr = CeedCudaInitInterpGrad(data->d_interp1d, data->d_grad1d, P1d,
                                  Q1d, &data->c_B, &data->c_G);
    CeedChk(ierr);
    void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->c_B, &data->c_G, &d_u, &d_v};
    if (dim==1) {
      CeedInt elemsPerBlock = 32;
      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
                                             ? 1 : 0 );
      CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->grad, grid, Q1d, 1, elemsPerBlock,
                                        sharedMem,
                                        gradargs);
      CeedChk(ierr);
    } else if (dim==2) {
      // Same guard as the interp case above.
      const CeedInt optElems[7] = {0,32,8,6,4,2,8};
      CeedInt elemsPerBlock = Q1d < 7 ? optElems[Q1d]/ncomp : 1;
      if (elemsPerBlock < 1) elemsPerBlock = 1;
      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
                                             ? 1 : 0 );
      CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->grad, grid, Q1d, Q1d,
                                        ncomp*elemsPerBlock, sharedMem,
                                        gradargs);
      CeedChk(ierr);
    } else if (dim==3) {
      CeedInt elemsPerBlock = 1;
      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
                                             ? 1 : 0 );
      CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->grad, grid, Q1d, Q1d,
                                        ncomp*elemsPerBlock, sharedMem,
                                        gradargs);
      CeedChk(ierr);
    }
  } else if (emode == CEED_EVAL_WEIGHT) {
    CeedInt Q1d;
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
    void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight1d, &d_v};
    if(dim==1) {
      // Fix: 32/Q1d is 0 when Q1d > 32, which previously caused a division
      // by zero in the grid-size computation.
      const CeedInt elemsPerBlock = Q1d > 32 ? 1 : 32/Q1d;
      const CeedInt gridsize = nelem/elemsPerBlock + ( (
                                 nelem/elemsPerBlock*elemsPerBlock<nelem)? 1 : 0 );
      ierr = CeedRunKernelDimCuda(ceed, data->weight, gridsize, Q1d, elemsPerBlock, 1,
                                  weightargs);
      CeedChk(ierr);
    } else if(dim==2) {
      const CeedInt optElems = 32/(Q1d*Q1d);
      const CeedInt elemsPerBlock = optElems>0?optElems:1;
      const CeedInt gridsize = nelem/elemsPerBlock + ( (
                                 nelem/elemsPerBlock*elemsPerBlock<nelem)? 1 : 0 );
      ierr = CeedRunKernelDimCuda(ceed, data->weight, gridsize, Q1d, Q1d,
                                  elemsPerBlock, weightargs);
      CeedChk(ierr);
    } else if(dim==3) {
      const CeedInt gridsize = nelem;
      ierr = CeedRunKernelDimCuda(ceed, data->weight, gridsize, Q1d, Q1d, Q1d,
                                  weightargs);
      CeedChk(ierr);
    }
  }

  if(emode!=CEED_EVAL_WEIGHT) {
    ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChk(ierr);
  }
  ierr = CeedVectorRestoreArray(v, &d_v); CeedChk(ierr);

  return 0;
}

// Release device buffers, the JIT module, and the backend data struct.
static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);

  CeedBasis_Cuda_shared *data;
  ierr = CeedBasisGetData(basis, (void *) &data); CeedChk(ierr);

  CeedChk_Cu(ceed, cuModuleUnload(data->module));

  ierr = cudaFree(data->d_qweight1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_interp1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_grad1d); CeedChk_Cu(ceed, ierr);

  ierr = CeedFree(&data); CeedChk(ierr);

  return 0;
}

// Create the shared-memory CUDA tensor basis: copy the 1D matrices and
// weights to the device, JIT-compile the kernel string with this basis's
// dimensions baked in, and register Apply/Destroy.
int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P1d, CeedInt Q1d,
                                        const CeedScalar *interp1d,
                                        const CeedScalar *grad1d,
                                        const CeedScalar *qref1d,
                                        const CeedScalar *qweight1d,
                                        CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);
  if (Q1d<P1d) {
    return CeedError(ceed, 1, "Backend does not implement underintegrated basis.");
  }
  CeedBasis_Cuda_shared *data;
  ierr = CeedCalloc(1, &data); CeedChk(ierr);

  const CeedInt qBytes = Q1d * sizeof(CeedScalar);
  ierr = cudaMalloc((void **)&data->d_qweight1d, qBytes); CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_qweight1d, qweight1d, qBytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  const CeedInt iBytes = qBytes * P1d;
  ierr = cudaMalloc((void **)&data->d_interp1d, iBytes); CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_interp1d, interp1d, iBytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  ierr = cudaMalloc((void **)&data->d_grad1d, iBytes); CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_grad1d, grad1d, iBytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  CeedInt ncomp;
  ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChk(ierr);
  ierr = CeedCompileCuda(ceed, kernelsShared, &data->module, 7,
                         "Q1D", Q1d,
                         "P1D", P1d,
                         "BASIS_BUF_LEN", ncomp * CeedIntPow(Q1d > P1d ?
                             Q1d : P1d, dim),
                         "BASIS_DIM", dim,
                         "BASIS_NCOMP", ncomp,
                         "BASIS_ELEMSIZE", CeedIntPow(P1d, dim),
                         "BASIS_NQPT", CeedIntPow(Q1d, dim)
                        ); CeedChk(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "interp", &data->interp);
  CeedChk(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "grad", &data->grad);
  CeedChk(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "weight", &data->weight);
  CeedChk(ierr);

  ierr = CeedBasisSetData(basis, (void *)&data);
  CeedChk(ierr);
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
                                CeedBasisApplyTensor_Cuda_shared);
  CeedChk(ierr);
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
                                CeedBasisDestroy_Cuda_shared);
  CeedChk(ierr);
  return 0;
}