1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for CUDA shared memory basis read/write templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // Load matrices for basis actions 14 //------------------------------------------------------------------------------ 15 template <int P, int Q> 16 inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 17 for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; 18 } 19 20 //------------------------------------------------------------------------------ 21 // 1D 22 //------------------------------------------------------------------------------ 23 24 //------------------------------------------------------------------------------ 25 // E-vector -> single element 26 //------------------------------------------------------------------------------ 27 template <int NUM_COMP, int P_1D> 28 inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, 29 const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 30 if (data.t_id_x < P_1D) { 31 const CeedInt node = data.t_id_x; 32 const CeedInt ind = node * strides_node + elem * strides_elem; 33 34 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 35 r_u[comp] = d_u[ind + comp * strides_comp]; 36 } 37 } 38 } 39 40 //------------------------------------------------------------------------------ 41 // Single element -> E-vector 42 //------------------------------------------------------------------------------ 43 template <int NUM_COMP, int P_1D> 44 inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, 45 const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { 46 if (data.t_id_x < P_1D) { 47 const CeedInt node = data.t_id_x; 48 const CeedInt ind = node * strides_node + elem * strides_elem; 49 50 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 51 d_v[ind + comp * strides_comp] = r_v[comp]; 52 } 53 } 54 } 55 56 template <int NUM_COMP, int P_1D> 57 inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, 58 const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { 59 if (data.t_id_x < P_1D) { 60 const CeedInt node = data.t_id_x; 61 const CeedInt ind = node * strides_node + elem * strides_elem; 62 63 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 64 d_v[ind + comp * strides_comp] += r_v[comp]; 65 } 66 } 67 } 68 69 //------------------------------------------------------------------------------ 70 // 2D 71 //------------------------------------------------------------------------------ 72 73 //------------------------------------------------------------------------------ 74 // E-vector -> single element 75 //------------------------------------------------------------------------------ 76 template <int NUM_COMP, int P_1D> 77 inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, 78 const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 79 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 80 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 81 const CeedInt ind = node * strides_node + elem * strides_elem; 82 83 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 84 r_u[comp] = d_u[ind + comp * strides_comp]; 85 } 86 } 87 } 88 89 //------------------------------------------------------------------------------ 90 // Single element -> E-vector 91 //------------------------------------------------------------------------------ 92 template <int NUM_COMP, int P_1D> 93 inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, 94 const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { 95 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 96 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 97 const CeedInt ind = node * strides_node + elem * strides_elem; 98 99 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 100 d_v[ind + comp * strides_comp] = r_v[comp]; 101 } 102 } 103 } 104 105 template <int NUM_COMP, int P_1D> 106 inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, 107 const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { 108 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 109 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 110 const CeedInt ind = node * strides_node + elem * strides_elem; 111 112 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 113 d_v[ind + comp * strides_comp] += r_v[comp]; 114 } 115 } 116 } 117 118 //------------------------------------------------------------------------------ 119 // 3D 120 //------------------------------------------------------------------------------ 121 122 //------------------------------------------------------------------------------ 123 // E-vector -> single element 124 //------------------------------------------------------------------------------ 125 template <int NUM_COMP, int P_1D> 126 inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, 127 const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 128 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 129 for (CeedInt z = 0; z < P_1D; z++) { 130 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 131 const CeedInt ind = node * strides_node + elem * strides_elem; 132 133 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 134 r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp]; 135 } 136 } 137 } 138 } 139 140 //------------------------------------------------------------------------------ 141 // Single element -> E-vector 142 //------------------------------------------------------------------------------ 143 template <int NUM_COMP, int P_1D> 144 inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, 145 const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { 146 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 147 for (CeedInt z = 0; z < P_1D; z++) { 148 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 149 const CeedInt ind = node * strides_node + elem * strides_elem; 150 151 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 152 d_v[ind + comp * strides_comp] = r_v[z + comp * P_1D]; 153 } 154 } 155 } 156 } 157 158 template <int NUM_COMP, int P_1D> 159 inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, 160 const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { 161 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 162 for (CeedInt z = 0; z < P_1D; z++) { 163 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 164 const CeedInt ind = node * strides_node + elem * strides_elem; 165 166 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 167 d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; 168 } 169 } 170 } 171 } 172 173 //------------------------------------------------------------------------------ 174 // AtPoints 175 //------------------------------------------------------------------------------ 176 177 //------------------------------------------------------------------------------ 178 // E-vector -> single point 179 //------------------------------------------------------------------------------ 180 template <int NUM_COMP, int NUM_PTS> 181 inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 182 const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, 183 const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 184 const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem; 185 186 if (p < points_in_elem) { 187 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 188 r_u[comp] = d_u[ind + comp * strides_comp]; 189 } 190 } else { 191 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 192 r_u[comp] = 0.0; 193 } 194 } 195 } 196 197 //------------------------------------------------------------------------------ 198 // Single point -> E-vector 199 //------------------------------------------------------------------------------ 200 template <int NUM_COMP, int NUM_PTS> 201 inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 202 const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, 203 CeedScalar *d_v) { 204 if (p < points_in_elem) { 205 const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem; 206 207 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 208 d_v[ind + comp * strides_comp] = r_v[comp]; 209 } 210 } 211 } 212