// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory non-tensor basis
#include <ceed/types.h>

#include "cuda-shared-basis-nontensor-templates.h"
#include "cuda-shared-basis-read-write-templates.h"

//------------------------------------------------------------------------------
// Interp kernels
//------------------------------------------------------------------------------
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping consumed by the shared-memory basis templates.
  // Each z-layer of the block handles its own element, so it receives its own
  // BASIS_T_1D-wide slice of the dynamic shared memory.
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  CeedScalar r_U[BASIS_NUM_COMP], r_V[BASIS_NUM_COMP];

  // Stage the interpolation matrix in shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Grid-stride over elements: read nodal values, interpolate, write quadrature values
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}
43
extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                           CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the shared-memory basis templates; each
  // z-layer of the block owns one element and one slice of shared memory.
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  CeedScalar r_U[BASIS_NUM_COMP], r_V[BASIS_NUM_COMP];

  // Stage the interpolation matrix in shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Transpose action: quadrature-point values in, nodal values written to d_V
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}
70
extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                              CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the shared-memory basis templates; each
  // z-layer of the block owns one element and one slice of shared memory.
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  CeedScalar r_U[BASIS_NUM_COMP], r_V[BASIS_NUM_COMP];

  // Stage the interpolation matrix in shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Same transpose action as InterpTranspose, but the per-element result is
  // folded into d_V via SumElementStrided1d instead of overwriting it
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Grad kernels
//------------------------------------------------------------------------------
extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the shared-memory basis templates; each
  // z-layer of the block owns one element and one slice of shared memory.
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  // Gradient output carries one value per component per dimension
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

  // Stage the gradient matrix (all BASIS_DIM pieces) in shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Grid-stride over elements: read nodal values, apply gradient, write quadrature values
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}
126
extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                         CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the shared-memory basis templates; each
  // z-layer of the block owns one element and one slice of shared memory.
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  // Transpose direction: input has one value per component per dimension
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage the gradient matrix (all BASIS_DIM pieces) in shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Transpose action: quadrature-point gradients in, nodal values written to d_V
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}
153
extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the shared-memory basis templates; each
  // z-layer of the block owns one element and one slice of shared memory.
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  // Transpose direction: input has one value per component per dimension
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage the gradient matrix (all BASIS_DIM pieces) in shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Same transpose action as GradTranspose, but the per-element result is
  // folded into d_V via SumElementStrided1d instead of overwriting it
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Weight kernel
//------------------------------------------------------------------------------
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the shared-memory basis templates; each
  // z-layer of the block owns one element and one slice of shared memory.
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  // Single quadrature-weight component per thread
  CeedScalar r_W[1];

  // Grid-stride over elements: the same quadrature weights are written for every element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    WeightNonTensor<BASIS_P, BASIS_Q>(data, q_weight, r_W);
    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
  }
}
201