// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory non-tensor basis
#include <ceed/types.h>

#include "cuda-shared-basis-nontensor-templates.h"
#include "cuda-shared-basis-read-write-templates.h"

//------------------------------------------------------------------------------
// Interp kernel by dim
//------------------------------------------------------------------------------

// Apply the interpolation matrix c_B: d_V[q, comp, elem] = B * d_U[p, comp, elem].
//
// Launch layout (shared by all kernels in this file): each thread block handles
// blockDim.z elements concurrently, one element per z-slice, and the element
// loop is grid-strided (elem += gridDim.x * blockDim.z) so any grid size covers
// num_elem. Dynamic shared memory must be sized for at least
// blockDim.z * T_1D scalars; each z-slice gets its own T_1D-scalar window.
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  // Per-element scratch window within dynamic shared memory
  data.slice = slice + data.t_id_z * T_1D;

  // Per-thread registers: one value per component at nodes (r_U) / quad points (r_V)
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Apply the transpose of the interpolation matrix: d_V[p, comp, elem] = B^T * d_U[q, comp, elem].
// Overwrites d_V (contrast with InterpTransposeAdd below, which accumulates).
extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                           CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Same as InterpTranspose, but sums into d_V instead of overwriting
// (SumElementStrided1d in place of WriteElementStrided1d).
extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                              CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Grad kernel by dim
//------------------------------------------------------------------------------

// Apply the gradient matrix c_G: produces BASIS_DIM derivative components per
// input component, so d_V holds BASIS_NUM_COMP * BASIS_DIM fields at quad points.
extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Apply the transpose of the gradient matrix: contracts the BASIS_DIM derivative
// fields of d_U back to BASIS_NUM_COMP nodal fields in d_V. Overwrites d_V.
extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                         CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Same as GradTranspose, but sums into d_V instead of overwriting.
extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Weight kernels by dim
//------------------------------------------------------------------------------

// Broadcast the quadrature weights q_weight into d_W for every element
// (single-component output, BASIS_Q values per element).
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_W[1];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
  }
}