// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP shared memory non-tensor basis
///
/// All kernels below assume:
///   - blockDim.z elements are processed per thread block; the grid strides
///     over the remaining elements (elem += gridDim.x * blockDim.z)
///   - dynamic shared memory holds at least blockDim.z * T_1D CeedScalars
///     (one slice per z-layer of the block)
///   - BASIS_P, BASIS_Q, BASIS_DIM, BASIS_NUM_COMP, T_1D, and the
///     *_BLOCK_SIZE macros are defined at JIT compile time
#include <ceed/types.h>

#include "hip-shared-basis-read-write-templates.h"
#include "hip-shared-basis-nontensor-templates.h"

//------------------------------------------------------------------------------
// Initialize per-thread bookkeeping shared by all kernels in this file.
// 'slice' is the block's dynamic shared memory buffer; each z-layer
// (one batched element) gets its own T_1D-scalar slice.
//------------------------------------------------------------------------------
inline __device__ SharedData_Hip InitSharedData(CeedScalar *slice) {
  SharedData_Hip data;

  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;
  return data;
}

//------------------------------------------------------------------------------
// Interp kernels
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Interpolate: nodes (P) -> quadrature points (Q)
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
                                                                             const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data = InitSharedData(slice);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp matrix into shared memory (all threads cooperate; barrier
  // before any thread reads it)
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements)
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Interpolate transpose: quadrature points (Q) -> nodes (P), overwriting d_V
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B,
                                                                                      const CeedScalar *__restrict__ d_U,
                                                                                      CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data = InitSharedData(slice);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp matrix into shared memory
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements)
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Interpolate transpose: quadrature points (Q) -> nodes (P), summing into d_V
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B,
                                                                                         const CeedScalar *__restrict__ d_U,
                                                                                         CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data = InitSharedData(slice);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp matrix into shared memory
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements);
  // SumElementStrided1d accumulates into d_V instead of overwriting
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Grad kernels
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Gradient: nodes (P) -> derivatives at quadrature points (Q x DIM)
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G,
                                                                         const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data = InitSharedData(slice);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

  // Load grad matrix into shared memory
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements)
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Gradient transpose: derivatives at quadrature points (Q x DIM) -> nodes (P),
// overwriting d_V
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G,
                                                                                  const CeedScalar *__restrict__ d_U,
                                                                                  CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data = InitSharedData(slice);

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load grad matrix into shared memory
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements)
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Gradient transpose: derivatives at quadrature points (Q x DIM) -> nodes (P),
// summing into d_V
//
// NOTE(review): c_B is never read in this kernel; it is kept only so the
// launch argument list stays compatible with the host-side launcher —
// confirm against the caller before removing.
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B,
                                                                                     const CeedScalar *c_G,
                                                                                     const CeedScalar *__restrict__ d_U,
                                                                                     CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data = InitSharedData(slice);

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load grad matrix into shared memory
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element (grid-stride over elements);
  // SumElementStrided1d accumulates into d_V instead of overwriting
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Weight kernel
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Write quadrature weights for each element.
//
// Fix: the loop body previously referenced 'q_weight', which is not declared
// anywhere in this file; the parameter is named 'q_weight_1d'.
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__ void Weight(const CeedInt num_elem,
                                                                             const CeedScalar *__restrict__ q_weight_1d,
                                                                             CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data = InitSharedData(slice);

  CeedScalar r_W[1];

  // Weights depend only on the thread's quadrature point, not on the element,
  // so compute them once and replicate per element
  WeightNonTensor<BASIS_Q>(data, q_weight_1d, r_W);

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
  }
}