// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP shared memory non-tensor basis
#include <ceed/types.h>

#include "hip-shared-basis-nontensor-templates.h"
#include "hip-shared-basis-read-write-templates.h"

//------------------------------------------------------------------------------
// Thread context setup
//------------------------------------------------------------------------------
// Builds the per-thread SharedData_Hip bookkeeping struct: thread indices, the
// flattened thread id within the block, and this thread's slice of the
// dynamically allocated shared-memory scratch area (one T_1D-wide slice per
// z-layer of the block, i.e. per element handled concurrently).
static __device__ __forceinline__ SharedData_Hip InitThreadContext(CeedScalar *scratch) {
  SharedData_Hip ctx;

  ctx.t_id_x = threadIdx.x;
  ctx.t_id_y = threadIdx.y;
  ctx.t_id_z = threadIdx.z;
  ctx.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  ctx.slice  = scratch + ctx.t_id_z * T_1D;
  return ctx;
}

//------------------------------------------------------------------------------
// Interp kernels
//------------------------------------------------------------------------------
// Apply the non-tensor interpolation matrix B (P x Q) to each element.
// Reads P nodal values per component, writes Q quadrature values per component.
// Elements are processed in a grid-stride fashion over blockDim.z z-layers.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  SharedData_Hip data = InitThreadContext(scratch);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage the interp matrix in shared memory; barrier before any thread uses it
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Grid-stride loop over elements; each z-layer of the block owns one element
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, e, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, e, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Apply the transpose of the interpolation matrix: Q quadrature values in,
// P nodal values out per component. Output is overwritten (not accumulated).
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  SharedData_Hip data = InitThreadContext(scratch);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage the interp matrix in shared memory; barrier before any thread uses it
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Grid-stride loop over elements; each z-layer of the block owns one element
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, e, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, e, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Same as InterpTranspose but sums into d_V instead of overwriting it
// (SumElementStrided1d vs. WriteElementStrided1d).
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  SharedData_Hip data = InitThreadContext(scratch);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage the interp matrix in shared memory; barrier before any thread uses it
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Grid-stride loop over elements; each z-layer of the block owns one element
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, e, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, e, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Grad kernels
//------------------------------------------------------------------------------
// Apply the non-tensor gradient matrix G (P x Q*DIM): P nodal values in,
// Q values per dimension out, for each component.
// NOTE(review): these Grad/Weight kernels reuse BASIS_INTERP_BLOCK_SIZE for
// their launch bounds — confirm against the host code that defines these
// macros; other basis kernels in this backend use per-kernel block sizes.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  SharedData_Hip data = InitThreadContext(scratch);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

  // Stage the grad matrix in shared memory; barrier before any thread uses it
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Grid-stride loop over elements; each z-layer of the block owns one element
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, e, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, e, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Apply the transpose of the gradient matrix: Q values per dimension in,
// P nodal values out per component. Output is overwritten (not accumulated).
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  SharedData_Hip data = InitThreadContext(scratch);

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage the grad matrix in shared memory; barrier before any thread uses it
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Grid-stride loop over elements; each z-layer of the block owns one element
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, e, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, e, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Same as GradTranspose but sums into d_V instead of overwriting it
// (SumElementStrided1d vs. WriteElementStrided1d).
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  SharedData_Hip data = InitThreadContext(scratch);

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage the grad matrix in shared memory; barrier before any thread uses it
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Grid-stride loop over elements; each z-layer of the block owns one element
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, e, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, e, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Weight kernel
//------------------------------------------------------------------------------
// Broadcast the Q quadrature weights into every element's slot of d_W.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar scratch[];

  SharedData_Hip data = InitThreadContext(scratch);

  CeedScalar r_W[1];

  // Grid-stride loop over elements; each z-layer of the block owns one element
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
    WriteElementStrided1d<1, BASIS_Q>(data, e, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
  }
}