// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory non-tensor basis
#include <ceed/types.h>

#include "cuda-shared-basis-nontensor-templates.h"
#include "cuda-shared-basis-read-write-templates.h"

//------------------------------------------------------------------------------
// Thread setup helper
//------------------------------------------------------------------------------
// Fill `data` with this thread's block-local indices and its private slice of
// the dynamically allocated shared memory.
// Each threadIdx.z layer owns T_1D consecutive scalars of `slice`, so the
// launch must provide at least blockDim.z * T_1D scalars of dynamic shared
// memory.
static __device__ __forceinline__ void InitSharedData(CeedScalar *slice, SharedData_Cuda &data) {
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;
}

//------------------------------------------------------------------------------
// Interp kernels
//------------------------------------------------------------------------------
// Apply the interpolation matrix c_B (BASIS_P nodes -> BASIS_Q quadrature
// points) to each element.
// d_U: input,  BASIS_NUM_COMP components with component stride BASIS_P * num_elem
// d_V: output, BASIS_NUM_COMP components with component stride BASIS_Q * num_elem
// Elements are distributed via a grid-stride loop; each threadIdx.z layer of
// the block processes one element per iteration.
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  InitSharedData(slice, data);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage the interpolation matrix in shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Apply the transpose of the interpolation matrix (BASIS_Q quadrature points
// -> BASIS_P nodes) to each element, overwriting d_V.
// d_U: input,  BASIS_NUM_COMP components with component stride BASIS_Q * num_elem
// d_V: output, BASIS_NUM_COMP components with component stride BASIS_P * num_elem
extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                           CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  InitSharedData(slice, data);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage the interpolation matrix in shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Same as InterpTranspose, but accumulates into d_V (SumElementStrided1d)
// instead of overwriting it.
extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                              CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  InitSharedData(slice, data);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage the interpolation matrix in shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Grad kernels
//------------------------------------------------------------------------------
// Apply the gradient matrices c_G (BASIS_DIM derivative operators, each
// BASIS_P nodes -> BASIS_Q quadrature points) to each element.
// d_U: input,  BASIS_NUM_COMP components with component stride BASIS_P * num_elem
// d_V: output, BASIS_NUM_COMP * BASIS_DIM components with component stride
//      BASIS_Q * num_elem
extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  InitSharedData(slice, data);

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

  // Stage all BASIS_DIM gradient matrices in shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Apply the transpose of the gradient matrices (BASIS_Q quadrature points ->
// BASIS_P nodes, contracted over BASIS_DIM) to each element, overwriting d_V.
// d_U: input,  BASIS_NUM_COMP * BASIS_DIM components with component stride
//      BASIS_Q * num_elem
// d_V: output, BASIS_NUM_COMP components with component stride BASIS_P * num_elem
extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                         CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  InitSharedData(slice, data);

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage all BASIS_DIM gradient matrices in shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Same as GradTranspose, but accumulates into d_V (SumElementStrided1d)
// instead of overwriting it.
extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  InitSharedData(slice, data);

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Stage all BASIS_DIM gradient matrices in shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Weight kernel
//------------------------------------------------------------------------------
// Write the BASIS_Q quadrature weights q_weight into d_W for every element
// (single component, element stride BASIS_Q, total stride BASIS_Q * num_elem).
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  InitSharedData(slice, data);

  CeedScalar r_W[1];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
  }
}