xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h (revision a171b6ef1192a8f5f3031330339aaab2ded4c8fb)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA non-tensor product basis
10 #ifndef CEED_CUDA_REF_BASIS_NONTENSOR_H
11 #define CEED_CUDA_REF_BASIS_NONTENSOR_H
12 
13 #include <ceed.h>
14 
15 #include "cuda-ref-basis-nontensor-templates.h"
16 
17 //------------------------------------------------------------------------------
18 // Non-Tensor Basis Kernels
19 //------------------------------------------------------------------------------
20 
21 //------------------------------------------------------------------------------
22 // Interp
23 //------------------------------------------------------------------------------
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                  CeedScalar *__restrict__ d_V) {
  // Apply the interpolation matrix d_B to every element, one element per threadIdx.z slice.
  // Data layout strides (elements interleaved across the fastest dimension):
  const CeedInt u_stride    = BASIS_P * num_elem;
  const CeedInt v_stride    = BASIS_Q * num_elem;
  const CeedInt comp_stride = BASIS_NUM_COMP * BASIS_Q * num_elem;

  // Grid-stride loop over elements so any grid size covers num_elem
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    Contract<BASIS_NUM_COMP, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q>(e, BASIS_P, BASIS_Q, u_stride, v_stride, comp_stride, d_B, d_U, d_V);
  }
}
31 
extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                           CeedScalar *__restrict__ d_V) {
  // Apply the transpose of the interpolation matrix d_B (quadrature -> nodes),
  // one element per threadIdx.z slice; strides are swapped w.r.t. Interp.
  const CeedInt u_stride    = BASIS_Q * num_elem;
  const CeedInt v_stride    = BASIS_P * num_elem;
  const CeedInt comp_stride = BASIS_NUM_COMP * BASIS_Q * num_elem;

  // Grid-stride loop over elements so any grid size covers num_elem
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    ContractTranspose<BASIS_NUM_COMP, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q>(e, BASIS_Q, BASIS_P, u_stride, v_stride, comp_stride, d_B, d_U,
                                                                             d_V);
  }
}
39 
40 //------------------------------------------------------------------------------
41 // Deriv
42 //------------------------------------------------------------------------------
extern "C" __global__ void Deriv(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                 CeedScalar *__restrict__ d_V) {
  // Apply the derivative matrix d_B to every element (BASIS_Q_COMP_DERIV output
  // components per input component), one element per threadIdx.z slice.
  const CeedInt u_stride    = BASIS_P * num_elem;
  const CeedInt v_stride    = BASIS_Q * num_elem;
  const CeedInt comp_stride = BASIS_NUM_COMP * BASIS_Q * num_elem;

  // Grid-stride loop over elements so any grid size covers num_elem
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    Contract<BASIS_NUM_COMP, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q>(e, BASIS_P, BASIS_Q, u_stride, v_stride, comp_stride, d_B, d_U, d_V);
  }
}
50 
extern "C" __global__ void DerivTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                          CeedScalar *__restrict__ d_V) {
  // Apply the transpose of the derivative matrix d_B (quadrature -> nodes),
  // one element per threadIdx.z slice; strides are swapped w.r.t. Deriv.
  const CeedInt u_stride    = BASIS_Q * num_elem;
  const CeedInt v_stride    = BASIS_P * num_elem;
  const CeedInt comp_stride = BASIS_NUM_COMP * BASIS_Q * num_elem;

  // Grid-stride loop over elements so any grid size covers num_elem
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    ContractTranspose<BASIS_NUM_COMP, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q>(e, BASIS_Q, BASIS_P, u_stride, v_stride, comp_stride, d_B, d_U,
                                                                            d_V);
  }
}
58 
59 //------------------------------------------------------------------------------
60 // Weight
61 //------------------------------------------------------------------------------
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) {
  // Broadcast the quadrature weights into d_V for every element.
  // One thread per quadrature point in x; elements strided over z.
  const CeedInt    quad   = threadIdx.x;
  const CeedScalar weight = q_weight[quad];  // loop-invariant read, hoisted out of the element loop
  // TODO load q_weight in shared memory if blockDim.z > 1?

  // Grid-stride loop over elements so any grid size covers num_elem
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    d_V[e * BASIS_Q + quad] = weight;
  }
}
70 
71 //------------------------------------------------------------------------------
72 
73 #endif  // CEED_CUDA_REF_BASIS_NONTENSOR_H
74