xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h (revision 9ba83ac0e4b1fca39d6fa6737a318a9f0cbc172d)
1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
29ff05d55SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39ff05d55SJeremy L Thompson //
49ff05d55SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
59ff05d55SJeremy L Thompson //
69ff05d55SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
79ff05d55SJeremy L Thompson 
89ff05d55SJeremy L Thompson /// @file
99ff05d55SJeremy L Thompson /// Internal header for CUDA shared memory non-tensor basis
109ff05d55SJeremy L Thompson #include <ceed/types.h>
119ff05d55SJeremy L Thompson 
129ff05d55SJeremy L Thompson #include "cuda-shared-basis-nontensor-templates.h"
139ff05d55SJeremy L Thompson #include "cuda-shared-basis-read-write-templates.h"
149ff05d55SJeremy L Thompson 
159ff05d55SJeremy L Thompson //------------------------------------------------------------------------------
166c13bbcbSJeremy L Thompson // Interp kernels
179ff05d55SJeremy L Thompson //------------------------------------------------------------------------------
189ff05d55SJeremy L Thompson extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
199ff05d55SJeremy L Thompson   extern __shared__ CeedScalar slice[];
209ff05d55SJeremy L Thompson 
219ff05d55SJeremy L Thompson   SharedData_Cuda data;
229ff05d55SJeremy L Thompson   data.t_id_x = threadIdx.x;
239ff05d55SJeremy L Thompson   data.t_id_y = threadIdx.y;
249ff05d55SJeremy L Thompson   data.t_id_z = threadIdx.z;
259ff05d55SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
2699421279SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D;
279ff05d55SJeremy L Thompson 
289ff05d55SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP];
299ff05d55SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP];
309ff05d55SJeremy L Thompson 
31aa4002adSJeremy L Thompson   // load interp into shared memory
32aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
33aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
34aa4002adSJeremy L Thompson   __syncthreads();
35aa4002adSJeremy L Thompson 
36aa4002adSJeremy L Thompson   // Apply basis element by element
379ff05d55SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
389ff05d55SJeremy L Thompson     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
3999421279SJeremy L Thompson     InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
409ff05d55SJeremy L Thompson     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
419ff05d55SJeremy L Thompson   }
429ff05d55SJeremy L Thompson }
439ff05d55SJeremy L Thompson 
449ff05d55SJeremy L Thompson extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
459ff05d55SJeremy L Thompson                                            CeedScalar *__restrict__ d_V) {
469ff05d55SJeremy L Thompson   extern __shared__ CeedScalar slice[];
479ff05d55SJeremy L Thompson 
489ff05d55SJeremy L Thompson   SharedData_Cuda data;
499ff05d55SJeremy L Thompson   data.t_id_x = threadIdx.x;
509ff05d55SJeremy L Thompson   data.t_id_y = threadIdx.y;
519ff05d55SJeremy L Thompson   data.t_id_z = threadIdx.z;
529ff05d55SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
5399421279SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D;
549ff05d55SJeremy L Thompson 
559ff05d55SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP];
569ff05d55SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP];
579ff05d55SJeremy L Thompson 
58aa4002adSJeremy L Thompson   // load interp into shared memory
59aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
60aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
61aa4002adSJeremy L Thompson   __syncthreads();
62aa4002adSJeremy L Thompson 
63aa4002adSJeremy L Thompson   // Apply basis element by element
649ff05d55SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
659ff05d55SJeremy L Thompson     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
6699421279SJeremy L Thompson     InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
679ff05d55SJeremy L Thompson     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
689ff05d55SJeremy L Thompson   }
699ff05d55SJeremy L Thompson }
709ff05d55SJeremy L Thompson 
719ff05d55SJeremy L Thompson extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
729ff05d55SJeremy L Thompson                                               CeedScalar *__restrict__ d_V) {
739ff05d55SJeremy L Thompson   extern __shared__ CeedScalar slice[];
749ff05d55SJeremy L Thompson 
759ff05d55SJeremy L Thompson   SharedData_Cuda data;
769ff05d55SJeremy L Thompson   data.t_id_x = threadIdx.x;
779ff05d55SJeremy L Thompson   data.t_id_y = threadIdx.y;
789ff05d55SJeremy L Thompson   data.t_id_z = threadIdx.z;
799ff05d55SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
8099421279SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D;
819ff05d55SJeremy L Thompson 
829ff05d55SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP];
839ff05d55SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP];
849ff05d55SJeremy L Thompson 
85aa4002adSJeremy L Thompson   // load interp into shared memory
86aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
87aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
88aa4002adSJeremy L Thompson   __syncthreads();
89aa4002adSJeremy L Thompson 
90aa4002adSJeremy L Thompson   // Apply basis element by element
919ff05d55SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
929ff05d55SJeremy L Thompson     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
9399421279SJeremy L Thompson     InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
949ff05d55SJeremy L Thompson     SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
959ff05d55SJeremy L Thompson   }
969ff05d55SJeremy L Thompson }
979ff05d55SJeremy L Thompson 
989ff05d55SJeremy L Thompson //------------------------------------------------------------------------------
996c13bbcbSJeremy L Thompson // Grad kernels
1009ff05d55SJeremy L Thompson //------------------------------------------------------------------------------
1019ff05d55SJeremy L Thompson extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
1029ff05d55SJeremy L Thompson   extern __shared__ CeedScalar slice[];
1039ff05d55SJeremy L Thompson 
1049ff05d55SJeremy L Thompson   SharedData_Cuda data;
1059ff05d55SJeremy L Thompson   data.t_id_x = threadIdx.x;
1069ff05d55SJeremy L Thompson   data.t_id_y = threadIdx.y;
1079ff05d55SJeremy L Thompson   data.t_id_z = threadIdx.z;
1089ff05d55SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
10999421279SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D;
1109ff05d55SJeremy L Thompson 
1119ff05d55SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP];
1129ff05d55SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
1139ff05d55SJeremy L Thompson 
114aa4002adSJeremy L Thompson   // load grad into shared memory
115aa4002adSJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
116aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
117aa4002adSJeremy L Thompson   __syncthreads();
118aa4002adSJeremy L Thompson 
119aa4002adSJeremy L Thompson   // Apply basis element by element
1209ff05d55SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
1219ff05d55SJeremy L Thompson     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
12299421279SJeremy L Thompson     GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
1239ff05d55SJeremy L Thompson     WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
1249ff05d55SJeremy L Thompson   }
1259ff05d55SJeremy L Thompson }
1269ff05d55SJeremy L Thompson 
1279ff05d55SJeremy L Thompson extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
1289ff05d55SJeremy L Thompson                                          CeedScalar *__restrict__ d_V) {
1299ff05d55SJeremy L Thompson   extern __shared__ CeedScalar slice[];
1309ff05d55SJeremy L Thompson 
1319ff05d55SJeremy L Thompson   SharedData_Cuda data;
1329ff05d55SJeremy L Thompson   data.t_id_x = threadIdx.x;
1339ff05d55SJeremy L Thompson   data.t_id_y = threadIdx.y;
1349ff05d55SJeremy L Thompson   data.t_id_z = threadIdx.z;
1359ff05d55SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
13699421279SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D;
1379ff05d55SJeremy L Thompson 
1389ff05d55SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
1399ff05d55SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP];
1409ff05d55SJeremy L Thompson 
141aa4002adSJeremy L Thompson   // load grad into shared memory
142aa4002adSJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
143aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
144aa4002adSJeremy L Thompson   __syncthreads();
145aa4002adSJeremy L Thompson 
146aa4002adSJeremy L Thompson   // Apply basis element by element
1479ff05d55SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
1489ff05d55SJeremy L Thompson     ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
14999421279SJeremy L Thompson     GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
1509ff05d55SJeremy L Thompson     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
1519ff05d55SJeremy L Thompson   }
1529ff05d55SJeremy L Thompson }
1539ff05d55SJeremy L Thompson 
1549ff05d55SJeremy L Thompson extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
1559ff05d55SJeremy L Thompson                                             CeedScalar *__restrict__ d_V) {
1569ff05d55SJeremy L Thompson   extern __shared__ CeedScalar slice[];
1579ff05d55SJeremy L Thompson 
1589ff05d55SJeremy L Thompson   SharedData_Cuda data;
1599ff05d55SJeremy L Thompson   data.t_id_x = threadIdx.x;
1609ff05d55SJeremy L Thompson   data.t_id_y = threadIdx.y;
1619ff05d55SJeremy L Thompson   data.t_id_z = threadIdx.z;
1629ff05d55SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
16399421279SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D;
1649ff05d55SJeremy L Thompson 
1659ff05d55SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
1669ff05d55SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP];
1679ff05d55SJeremy L Thompson 
168aa4002adSJeremy L Thompson   // load grad into shared memory
169aa4002adSJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
170aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
171aa4002adSJeremy L Thompson   __syncthreads();
172aa4002adSJeremy L Thompson 
173aa4002adSJeremy L Thompson   // Apply basis element by element
1749ff05d55SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
1759ff05d55SJeremy L Thompson     ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
17699421279SJeremy L Thompson     GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
1779ff05d55SJeremy L Thompson     SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
1789ff05d55SJeremy L Thompson   }
1799ff05d55SJeremy L Thompson }
1809ff05d55SJeremy L Thompson 
1819ff05d55SJeremy L Thompson //------------------------------------------------------------------------------
1826c13bbcbSJeremy L Thompson // Weight kernel
1839ff05d55SJeremy L Thompson //------------------------------------------------------------------------------
1849ff05d55SJeremy L Thompson extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
1859ff05d55SJeremy L Thompson   extern __shared__ CeedScalar slice[];
1869ff05d55SJeremy L Thompson 
1879ff05d55SJeremy L Thompson   SharedData_Cuda data;
1889ff05d55SJeremy L Thompson   data.t_id_x = threadIdx.x;
1899ff05d55SJeremy L Thompson   data.t_id_y = threadIdx.y;
1909ff05d55SJeremy L Thompson   data.t_id_z = threadIdx.z;
1919ff05d55SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
19299421279SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D;
1939ff05d55SJeremy L Thompson 
1949ff05d55SJeremy L Thompson   CeedScalar r_W[1];
1959ff05d55SJeremy L Thompson 
1969ff05d55SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
197343e3094SJeremy L Thompson     WeightNonTensor<BASIS_P, BASIS_Q>(data, q_weight, r_W);
1989ff05d55SJeremy L Thompson     WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
1999ff05d55SJeremy L Thompson   }
2009ff05d55SJeremy L Thompson }
201