// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory non-tensor basis
#include <ceed/types.h>

#include "cuda-shared-basis-nontensor-templates.h"
#include "cuda-shared-basis-read-write-templates.h"

//------------------------------------------------------------------------------
// Interp kernels (non-tensor)
//------------------------------------------------------------------------------
// Interpolate nodal values to quadrature points for each element.
// One element is handled per threadIdx.z slab; a grid-stride loop over
// blockIdx.x covers all num_elem elements.  Dynamic shared memory is sliced
// per slab (T_1D scalars each) — launch is assumed to provide at least
// blockDim.z * T_1D scalars (NOTE(review): confirm against the host launcher).
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping handed to the read/write and basis templates
  SharedData_Cuda shared_data;
  shared_data.t_id_x = threadIdx.x;
  shared_data.t_id_y = threadIdx.y;
  shared_data.t_id_z = threadIdx.z;
  shared_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  shared_data.slice  = scratch + threadIdx.z * T_1D;

  CeedScalar r_u[BASIS_NUM_COMP];  // per-component nodal input
  CeedScalar r_v[BASIS_NUM_COMP];  // per-component quadrature output

  const CeedInt elem_stride = gridDim.x * blockDim.z;

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(shared_data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_u);
    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(shared_data, r_u, c_B, r_v);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(shared_data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_v, d_V);
  }
}

// Apply the transpose of interpolation: quadrature values back to nodes,
// overwriting the output (see InterpTransposeAdd for the accumulating form).
// One element per threadIdx.z slab; grid-stride loop over blockIdx.x.
extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                           CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping handed to the read/write and basis templates
  SharedData_Cuda shared_data;
  shared_data.t_id_x = threadIdx.x;
  shared_data.t_id_y = threadIdx.y;
  shared_data.t_id_z = threadIdx.z;
  shared_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  shared_data.slice  = scratch + threadIdx.z * T_1D;

  CeedScalar r_u[BASIS_NUM_COMP];  // per-component quadrature input
  CeedScalar r_v[BASIS_NUM_COMP];  // per-component nodal output

  const CeedInt elem_stride = gridDim.x * blockDim.z;

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(shared_data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_u);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(shared_data, r_u, c_B, r_v);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(shared_data, elem, 1, BASIS_P * num_elem, BASIS_P, r_v, d_V);
  }
}

// Apply the transpose of interpolation and accumulate (+=) into the output,
// via SumElementStrided1d instead of WriteElementStrided1d.
// One element per threadIdx.z slab; grid-stride loop over blockIdx.x.
extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                              CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping handed to the read/write and basis templates
  SharedData_Cuda shared_data;
  shared_data.t_id_x = threadIdx.x;
  shared_data.t_id_y = threadIdx.y;
  shared_data.t_id_z = threadIdx.z;
  shared_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  shared_data.slice  = scratch + threadIdx.z * T_1D;

  CeedScalar r_u[BASIS_NUM_COMP];  // per-component quadrature input
  CeedScalar r_v[BASIS_NUM_COMP];  // per-component nodal contribution

  const CeedInt elem_stride = gridDim.x * blockDim.z;

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(shared_data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_u);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(shared_data, r_u, c_B, r_v);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(shared_data, elem, 1, BASIS_P * num_elem, BASIS_P, r_v, d_V);
  }
}

//------------------------------------------------------------------------------
// Grad kernels (non-tensor)
//------------------------------------------------------------------------------
// Compute gradients at quadrature points from nodal values; the output holds
// BASIS_DIM derivative components per field component.
// One element per threadIdx.z slab; grid-stride loop over blockIdx.x.
extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping handed to the read/write and basis templates
  SharedData_Cuda shared_data;
  shared_data.t_id_x = threadIdx.x;
  shared_data.t_id_y = threadIdx.y;
  shared_data.t_id_z = threadIdx.z;
  shared_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  shared_data.slice  = scratch + threadIdx.z * T_1D;

  CeedScalar r_u[BASIS_NUM_COMP];              // nodal input
  CeedScalar r_v[BASIS_NUM_COMP * BASIS_DIM];  // gradient output, dim-major per component

  const CeedInt elem_stride = gridDim.x * blockDim.z;

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(shared_data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_u);
    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(shared_data, r_u, c_G, r_v);
    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(shared_data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_v, d_V);
  }
}

// Apply the transpose gradient: BASIS_DIM derivative components per field
// component at quadrature points are mapped back to nodes, overwriting the
// output (see GradTransposeAdd for the accumulating form).
extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                         CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping handed to the read/write and basis templates
  SharedData_Cuda shared_data;
  shared_data.t_id_x = threadIdx.x;
  shared_data.t_id_y = threadIdx.y;
  shared_data.t_id_z = threadIdx.z;
  shared_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  shared_data.slice  = scratch + threadIdx.z * T_1D;

  CeedScalar r_u[BASIS_NUM_COMP * BASIS_DIM];  // gradient input, dim-major per component
  CeedScalar r_v[BASIS_NUM_COMP];              // nodal output

  const CeedInt elem_stride = gridDim.x * blockDim.z;

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(shared_data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_u);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(shared_data, r_u, c_G, r_v);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(shared_data, elem, 1, BASIS_P * num_elem, BASIS_P, r_v, d_V);
  }
}

// Apply the transpose gradient and accumulate (+=) into the output, via
// SumElementStrided1d instead of WriteElementStrided1d.
extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping handed to the read/write and basis templates
  SharedData_Cuda shared_data;
  shared_data.t_id_x = threadIdx.x;
  shared_data.t_id_y = threadIdx.y;
  shared_data.t_id_z = threadIdx.z;
  shared_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  shared_data.slice  = scratch + threadIdx.z * T_1D;

  CeedScalar r_u[BASIS_NUM_COMP * BASIS_DIM];  // gradient input, dim-major per component
  CeedScalar r_v[BASIS_NUM_COMP];              // nodal contribution

  const CeedInt elem_stride = gridDim.x * blockDim.z;

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(shared_data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_u);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(shared_data, r_u, c_G, r_v);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(shared_data, elem, 1, BASIS_P * num_elem, BASIS_P, r_v, d_V);
  }
}

//------------------------------------------------------------------------------
// Weight kernel (non-tensor)
//------------------------------------------------------------------------------
// Copy the quadrature weights into the strided element output for every
// element.  One element per threadIdx.z slab; grid-stride loop over blockIdx.x.
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping handed to the read/write and basis templates
  SharedData_Cuda shared_data;
  shared_data.t_id_x = threadIdx.x;
  shared_data.t_id_y = threadIdx.y;
  shared_data.t_id_z = threadIdx.z;
  shared_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  shared_data.slice  = scratch + threadIdx.z * T_1D;

  CeedScalar r_w[1];  // this thread's weight value

  const CeedInt elem_stride = gridDim.x * blockDim.z;

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    WeightNonTensor<BASIS_Q>(shared_data, q_weight, r_w);
    WriteElementStrided1d<1, BASIS_Q>(shared_data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_w, d_W);
  }
}