xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h (revision bdcc27286a8034df1dd97bd8aefef85a0efa7b00)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory non-tensor basis
10 #include <ceed/types.h>
11 
12 #include "cuda-shared-basis-nontensor-templates.h"
13 #include "cuda-shared-basis-read-write-templates.h"
14 
15 //------------------------------------------------------------------------------
16 // Interp kernels
17 //------------------------------------------------------------------------------
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  // Apply the non-tensor interpolation matrix to every element:
  // d_V[elem] = B * d_U[elem], one element per z-slice of the thread block.
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping consumed by the shared-memory basis templates;
  // each z-slice of the block gets its own BASIS_T_1D-wide window of scratch
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  // Stage the interpolation matrix in shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Per-thread register storage, one entry per component
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Grid-stride loop over elements: blockDim.z elements processed per block per pass
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}
43 
extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                           CeedScalar *__restrict__ d_V) {
  // Apply the transpose of the non-tensor interpolation matrix element by
  // element: d_V[elem] = B^T * d_U[elem] (quadrature -> node space).
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the basis templates; one scratch window per z-slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  // Stage the interpolation matrix in shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Per-thread register storage, one entry per component
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Grid-stride loop over elements: blockDim.z elements processed per block per pass
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}
70 
extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                              CeedScalar *__restrict__ d_V) {
  // Same as InterpTranspose, but the result is summed into d_V
  // (SumElementStrided1d) instead of overwriting it.
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the basis templates; one scratch window per z-slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  // Stage the interpolation matrix in shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Per-thread register storage, one entry per component
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Grid-stride loop over elements: blockDim.z elements processed per block per pass
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}
97 
98 //------------------------------------------------------------------------------
99 // Grad kernels
100 //------------------------------------------------------------------------------
extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  // Apply the non-tensor gradient matrix to every element:
  // d_V holds BASIS_DIM derivative components per solution component.
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the basis templates; one scratch window per z-slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  // Stage the gradient matrix (all BASIS_DIM pieces) in shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Registers: input per component, output per component and dimension
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

  // Grid-stride loop over elements: blockDim.z elements processed per block per pass
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}
126 
extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                         CeedScalar *__restrict__ d_V) {
  // Apply the transpose of the non-tensor gradient matrix element by element:
  // collapses BASIS_DIM derivative components back onto the node values.
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the basis templates; one scratch window per z-slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  // Stage the gradient matrix (all BASIS_DIM pieces) in shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Registers: input per component and dimension, output per component
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Grid-stride loop over elements: blockDim.z elements processed per block per pass
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}
153 
extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
  // Same as GradTranspose, but the result is summed into d_V
  // (SumElementStrided1d) instead of overwriting it.
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the basis templates; one scratch window per z-slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  // Stage the gradient matrix (all BASIS_DIM pieces) in shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Registers: input per component and dimension, output per component
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Grid-stride loop over elements: blockDim.z elements processed per block per pass
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}
180 
181 //------------------------------------------------------------------------------
182 // Weight kernel
183 //------------------------------------------------------------------------------
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
  // Write the quadrature weights for every element into d_W.
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping for the basis templates; one scratch window per z-slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  data.slice  = scratch + data.t_id_z * BASIS_T_1D;

  // Single-component register for this thread's weight value
  CeedScalar r_W[1];

  // Grid-stride loop over elements: blockDim.z elements processed per block per pass
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    WeightNonTensor<BASIS_P, BASIS_Q>(data, q_weight, r_W);
    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
  }
}
201