// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP shared memory non-tensor basis
#include <ceed/types.h>

#include "hip-shared-basis-read-write-templates.h"
#include "hip-shared-basis-nontensor-templates.h"

//------------------------------------------------------------------------------
// Interp kernels
//------------------------------------------------------------------------------

// Apply the non-tensor interpolation matrix (BASIS_P nodes -> BASIS_Q points)
// to every element.
//   c_B : interpolation matrix, staged into shared memory
//   d_U : strided input E-vector, d_V : strided output E-vector
// Each block handles blockDim.z elements at a time and grid-strides over
// num_elem; dynamic shared memory supplies T_1D scalars of scratch per
// z-slab (data.slice). Macros (BASIS_*, T_1D) are injected at JIT time.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp into shared memory; barrier before any thread reads s_B
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements)
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Transpose interpolation (BASIS_Q points -> BASIS_P nodes); overwrites d_V.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp into shared memory; barrier before any thread reads s_B
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements)
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Transpose interpolation, accumulating into d_V (Sum instead of Write).
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp into shared memory; barrier before any thread reads s_B
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements)
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Grad kernels
//------------------------------------------------------------------------------

// Apply the non-tensor gradient matrix c_G; output has BASIS_DIM derivative
// components per input component (r_V is BASIS_NUM_COMP * BASIS_DIM wide).
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

  // Load grad into shared memory; barrier before any thread reads s_G
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements)
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Transpose gradient: contracts the BASIS_DIM derivative components back to
// BASIS_P nodes per component; overwrites d_V.
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load grad into shared memory; barrier before any thread reads s_G
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements)
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Transpose gradient, accumulating into d_V (Sum instead of Write).
// NOTE(review): c_B is accepted but never used in this kernel body — it is
// presumably kept so the launcher can pass a uniform argument list; confirm
// against the host-side launch code before removing.
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                      CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load grad into shared memory; barrier before any thread reads s_G
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements)
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Weight kernel
//------------------------------------------------------------------------------

// Write the BASIS_Q quadrature weights for every element.
// FIX: the parameter was declared `q_weight_1d` while the body referenced the
// undeclared name `q_weight`, which cannot compile; the parameter is renamed
// to `q_weight` (no "1d" — this is a non-tensor basis, and kernel parameter
// names are invisible to the module launcher, so the ABI is unchanged).
extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  CeedScalar r_W[1];

  // Same weights for every element; grid-stride over elements
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
  }
}