1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other 2 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE 3 // files for details. 4 // 5 // SPDX-License-Identifier: BSD-2-Clause 6 // 7 // This file is part of CEED: http://github.com/ceed 8 9 #include <ceed/backend.h> 10 #include <ceed/ceed.h> 11 #include <ceed/jit-tools.h> 12 13 #include <string> 14 #include <sycl/sycl.hpp> 15 16 #include "../sycl/ceed-sycl-compile.hpp" 17 #include "ceed-sycl-ref.hpp" 18 19 class CeedElemRestrSyclStridedNT; 20 class CeedElemRestrSyclOffsetNT; 21 class CeedElemRestrSyclStridedT; 22 class CeedElemRestrSyclOffsetT; 23 24 //------------------------------------------------------------------------------ 25 // Restriction Kernel : L-vector -> E-vector, strided 26 //------------------------------------------------------------------------------ 27 static int CeedElemRestrictionStridedNoTranspose_Sycl(sycl::queue &sycl_queue, const CeedElemRestriction_Sycl *impl, const CeedScalar *u, 28 CeedScalar *v) { 29 const CeedInt elem_size = impl->elem_size; 30 const CeedInt num_elem = impl->num_elem; 31 const CeedInt num_comp = impl->num_comp; 32 const CeedInt stride_nodes = impl->strides[0]; 33 const CeedInt stride_comp = impl->strides[1]; 34 const CeedInt stride_elem = impl->strides[2]; 35 sycl::range<1> kernel_range(num_elem * elem_size); 36 37 // Order queue 38 sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 39 sycl_queue.parallel_for<CeedElemRestrSyclStridedNT>(kernel_range, {e}, [=](sycl::id<1> node) { 40 const CeedInt loc_node = node % elem_size; 41 const CeedInt elem = node / elem_size; 42 43 for (CeedInt comp = 0; comp < num_comp; comp++) { 44 v[loc_node + comp * elem_size * num_elem + elem * elem_size] = u[loc_node * stride_nodes + comp * stride_comp + elem * stride_elem]; 45 } 46 }); 47 return CEED_ERROR_SUCCESS; 48 } 49 50 //------------------------------------------------------------------------------ 51 // Restriction Kernel : L-vector -> E-vector, offsets provided 52 //------------------------------------------------------------------------------ 53 static int CeedElemRestrictionOffsetNoTranspose_Sycl(sycl::queue &sycl_queue, const CeedElemRestriction_Sycl *impl, const CeedScalar *u, 54 CeedScalar *v) { 55 const CeedInt elem_size = impl->elem_size; 56 const CeedInt num_elem = impl->num_elem; 57 const CeedInt num_comp = impl->num_comp; 58 const CeedInt comp_stride = impl->comp_stride; 59 const CeedInt *indices = impl->d_ind; 60 61 sycl::range<1> kernel_range(num_elem * elem_size); 62 63 // Order queue 64 sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 65 sycl_queue.parallel_for<CeedElemRestrSyclOffsetNT>(kernel_range, {e}, [=](sycl::id<1> node) { 66 const CeedInt ind = indices[node]; 67 const CeedInt loc_node = node % elem_size; 68 const CeedInt elem = node / elem_size; 69 70 for (CeedInt comp = 0; comp < num_comp; comp++) { 71 v[loc_node + comp * elem_size * num_elem + elem * elem_size] = u[ind + comp * comp_stride]; 72 } 73 }); 74 return CEED_ERROR_SUCCESS; 75 } 76 77 //------------------------------------------------------------------------------ 78 // Kernel: E-vector -> L-vector, strided 79 //------------------------------------------------------------------------------ 80 static int CeedElemRestrictionStridedTranspose_Sycl(sycl::queue &sycl_queue, const CeedElemRestriction_Sycl *impl, const CeedScalar *u, 81 CeedScalar *v) { 82 const CeedInt elem_size = impl->elem_size; 83 const CeedInt num_elem = impl->num_elem; 84 const CeedInt num_comp = impl->num_comp; 85 const CeedInt stride_nodes = impl->strides[0]; 86 const CeedInt stride_comp = impl->strides[1]; 87 const CeedInt stride_elem = impl->strides[2]; 88 89 sycl::range<1> kernel_range(num_elem * elem_size); 90 91 // Order queue 92 sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 93 sycl_queue.parallel_for<CeedElemRestrSyclStridedT>(kernel_range, {e}, [=](sycl::id<1> node) { 94 const CeedInt loc_node = node % elem_size; 95 const CeedInt elem = node / elem_size; 96 97 for (CeedInt comp = 0; comp < num_comp; comp++) { 98 v[loc_node * stride_nodes + comp * stride_comp + elem * stride_elem] += u[loc_node + comp * elem_size * num_elem + elem * elem_size]; 99 } 100 }); 101 return CEED_ERROR_SUCCESS; 102 } 103 104 //------------------------------------------------------------------------------ 105 // Kernel: E-vector -> L-vector, offsets provided 106 //------------------------------------------------------------------------------ 107 static int CeedElemRestrictionOffsetTranspose_Sycl(sycl::queue &sycl_queue, const CeedElemRestriction_Sycl *impl, const CeedScalar *u, 108 CeedScalar *v) { 109 const CeedInt num_nodes = impl->num_nodes; 110 const CeedInt elem_size = impl->elem_size; 111 const CeedInt num_elem = impl->num_elem; 112 const CeedInt num_comp = impl->num_comp; 113 const CeedInt comp_stride = impl->comp_stride; 114 const CeedInt *l_vec_indices = impl->d_l_vec_indices; 115 const CeedInt *t_offsets = impl->d_t_offsets; 116 const CeedInt *t_indices = impl->d_t_indices; 117 118 sycl::range<1> kernel_range(num_nodes * num_comp); 119 120 // Order queue 121 sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 122 sycl_queue.parallel_for<CeedElemRestrSyclOffsetT>(kernel_range, {e}, [=](sycl::id<1> id) { 123 const CeedInt node = id % num_nodes; 124 const CeedInt comp = id / num_nodes; 125 const CeedInt ind = l_vec_indices[node]; 126 const CeedInt range_1 = t_offsets[node]; 127 const CeedInt range_N = t_offsets[node + 1]; 128 CeedScalar value = 0.0; 129 130 for (CeedInt j = range_1; j < range_N; j++) { 131 const CeedInt t_ind = t_indices[j]; 132 CeedInt loc_node = t_ind % elem_size; 133 CeedInt elem = t_ind / elem_size; 134 135 value += u[loc_node + comp * elem_size * num_elem + elem * elem_size]; 136 } 137 v[ind + comp * comp_stride] += value; 138 }); 139 return CEED_ERROR_SUCCESS; 140 } 141 142 //------------------------------------------------------------------------------ 143 // Apply restriction 144 //------------------------------------------------------------------------------ 145 static int CeedElemRestrictionApply_Sycl(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { 146 Ceed ceed; 147 Ceed_Sycl *data; 148 const CeedScalar *d_u; 149 CeedScalar *d_v; 150 CeedElemRestriction_Sycl *impl; 151 152 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 153 CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 154 CeedCallBackend(CeedGetData(ceed, &data)); 155 156 // Get vectors 157 CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 158 if (t_mode == CEED_TRANSPOSE) { 159 // Sum into for transpose mode, e-vec to l-vec 160 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 161 } else { 162 // Overwrite for notranspose mode, l-vec to e-vec 163 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 164 } 165 166 // Restrict 167 if (t_mode == CEED_NOTRANSPOSE) { 168 // L-vector -> E-vector 169 if (impl->d_ind) { 170 // -- Offsets provided 171 CeedCallBackend(CeedElemRestrictionOffsetNoTranspose_Sycl(data->sycl_queue, impl, d_u, d_v)); 172 } else { 173 // -- Strided restriction 174 CeedCallBackend(CeedElemRestrictionStridedNoTranspose_Sycl(data->sycl_queue, impl, d_u, d_v)); 175 } 176 } else { 177 // E-vector -> L-vector 178 if (impl->d_ind) { 179 // -- Offsets provided 180 CeedCallBackend(CeedElemRestrictionOffsetTranspose_Sycl(data->sycl_queue, impl, d_u, d_v)); 181 } else { 182 // -- Strided restriction 183 CeedCallBackend(CeedElemRestrictionStridedTranspose_Sycl(data->sycl_queue, impl, d_u, d_v)); 184 } 185 } 186 // Wait for queues to be completed. NOTE: This may not be necessary 187 CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); 188 189 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; 190 191 // Restore arrays 192 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 193 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 194 return CEED_ERROR_SUCCESS; 195 } 196 197 //------------------------------------------------------------------------------ 198 // Get offsets 199 //------------------------------------------------------------------------------ 200 static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction r, CeedMemType m_type, const CeedInt **offsets) { 201 Ceed ceed; 202 CeedElemRestriction_Sycl *impl; 203 204 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 205 CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 206 207 switch (m_type) { 208 case CEED_MEM_HOST: 209 *offsets = impl->h_ind; 210 break; 211 case CEED_MEM_DEVICE: 212 *offsets = impl->d_ind; 213 break; 214 } 215 return CEED_ERROR_SUCCESS; 216 } 217 218 //------------------------------------------------------------------------------ 219 // Destroy restriction 220 //------------------------------------------------------------------------------ 221 static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction r) { 222 Ceed ceed; 223 Ceed_Sycl *data; 224 CeedElemRestriction_Sycl *impl; 225 226 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 227 CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 228 CeedCallBackend(CeedGetData(ceed, &data)); 229 230 // Wait for all work to finish before freeing memory 231 CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); 232 233 CeedCallBackend(CeedFree(&impl->h_ind_allocated)); 234 CeedCallSycl(ceed, sycl::free(impl->d_ind_allocated, data->sycl_context)); 235 CeedCallSycl(ceed, sycl::free(impl->d_t_offsets, data->sycl_context)); 236 CeedCallSycl(ceed, sycl::free(impl->d_t_indices, data->sycl_context)); 237 CeedCallSycl(ceed, sycl::free(impl->d_l_vec_indices, data->sycl_context)); 238 CeedCallBackend(CeedFree(&impl)); 239 return CEED_ERROR_SUCCESS; 240 } 241 242 //------------------------------------------------------------------------------ 243 // Create transpose offsets and indices 244 //------------------------------------------------------------------------------ 245 static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const CeedInt *indices) { 246 Ceed ceed; 247 Ceed_Sycl *data; 248 bool *is_node; 249 CeedSize l_size; 250 CeedInt num_elem, elem_size, num_comp, num_nodes = 0, *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; 251 CeedElemRestriction_Sycl *impl; 252 253 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 254 CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 255 CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); 256 CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); 257 CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); 258 CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); 259 260 // Count num_nodes 261 CeedCallBackend(CeedCalloc(l_size, &is_node)); 262 const CeedInt size_indices = num_elem * elem_size; 263 264 for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; 265 for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; 266 impl->num_nodes = num_nodes; 267 268 // L-vector offsets array 269 CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); 270 CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); 271 for (CeedInt i = 0, j = 0; i < l_size; i++) { 272 if (is_node[i]) { 273 l_vec_indices[j] = i; 274 ind_to_offset[i] = j++; 275 } 276 } 277 CeedCallBackend(CeedFree(&is_node)); 278 279 // Compute transpose offsets and indices 280 const CeedInt size_offsets = num_nodes + 1; 281 282 CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); 283 CeedCallBackend(CeedMalloc(size_indices, &t_indices)); 284 // Count node multiplicity 285 for (CeedInt e = 0; e < num_elem; ++e) { 286 for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1]; 287 } 288 // Convert to running sum 289 for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1]; 290 // List all E-vec indices associated with L-vec node 291 for (CeedInt e = 0; e < num_elem; ++e) { 292 for (CeedInt i = 0; i < elem_size; ++i) { 293 const CeedInt lid = elem_size * e + i; 294 const CeedInt gid = indices[lid]; 295 t_indices[t_offsets[ind_to_offset[gid]]++] = lid; 296 } 297 } 298 // Reset running sum 299 for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1]; 300 t_offsets[0] = 0; 301 302 // Copy data to device 303 CeedCallBackend(CeedGetData(ceed, &data)); 304 305 // Order queue 306 sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); 307 308 // -- L-vector indices 309 CeedCallSycl(ceed, impl->d_l_vec_indices = sycl::malloc_device<CeedInt>(num_nodes, data->sycl_device, data->sycl_context)); 310 sycl::event copy_lvec = data->sycl_queue.copy<CeedInt>(l_vec_indices, impl->d_l_vec_indices, num_nodes, {e}); 311 // -- Transpose offsets 312 CeedCallSycl(ceed, impl->d_t_offsets = sycl::malloc_device<CeedInt>(size_offsets, data->sycl_device, data->sycl_context)); 313 sycl::event copy_offsets = data->sycl_queue.copy<CeedInt>(t_offsets, impl->d_t_offsets, size_offsets, {e}); 314 // -- Transpose indices 315 CeedCallSycl(ceed, impl->d_t_indices = sycl::malloc_device<CeedInt>(size_indices, data->sycl_device, data->sycl_context)); 316 sycl::event copy_indices = data->sycl_queue.copy<CeedInt>(t_indices, impl->d_t_indices, size_indices, {e}); 317 318 // Wait for all copies to complete and handle exceptions 319 CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_lvec, copy_offsets, copy_indices})); 320 321 // Cleanup 322 CeedCallBackend(CeedFree(&ind_to_offset)); 323 CeedCallBackend(CeedFree(&l_vec_indices)); 324 CeedCallBackend(CeedFree(&t_offsets)); 325 CeedCallBackend(CeedFree(&t_indices)); 326 return CEED_ERROR_SUCCESS; 327 } 328 329 //------------------------------------------------------------------------------ 330 // Create restriction 331 //------------------------------------------------------------------------------ 332 int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, 333 const CeedInt8 *curl_orients, CeedElemRestriction r) { 334 Ceed ceed; 335 Ceed_Sycl *data; 336 bool is_strided; 337 CeedInt num_elem, num_comp, elem_size, comp_stride = 1; 338 CeedRestrictionType rstr_type; 339 CeedElemRestriction_Sycl *impl; 340 341 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 342 CeedCallBackend(CeedGetData(ceed, &data)); 343 CeedCallBackend(CeedCalloc(1, &impl)); 344 CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); 345 CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); 346 CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); 347 CeedInt size = num_elem * elem_size; 348 CeedInt strides[3] = {1, size, elem_size}; 349 350 CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); 351 CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, 352 "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); 353 354 // Stride data 355 CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); 356 if (is_strided) { 357 bool has_backend_strides; 358 359 CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); 360 if (!has_backend_strides) { 361 CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); 362 } 363 } else { 364 CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); 365 } 366 367 impl->h_ind = NULL; 368 impl->h_ind_allocated = NULL; 369 impl->d_ind = NULL; 370 impl->d_ind_allocated = NULL; 371 impl->d_t_indices = NULL; 372 impl->d_t_offsets = NULL; 373 impl->num_nodes = size; 374 impl->num_elem = num_elem; 375 impl->num_comp = num_comp; 376 impl->elem_size = elem_size; 377 impl->comp_stride = comp_stride; 378 impl->strides[0] = strides[0]; 379 impl->strides[1] = strides[1]; 380 impl->strides[2] = strides[2]; 381 CeedCallBackend(CeedElemRestrictionSetData(r, impl)); 382 CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; 383 CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); 384 385 // Set up device indices/offset arrays 386 if (mem_type == CEED_MEM_HOST) { 387 switch (copy_mode) { 388 case CEED_OWN_POINTER: 389 impl->h_ind_allocated = (CeedInt *)indices; 390 impl->h_ind = (CeedInt *)indices; 391 break; 392 case CEED_USE_POINTER: 393 impl->h_ind = (CeedInt *)indices; 394 break; 395 case CEED_COPY_VALUES: 396 if (indices != NULL) { 397 CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); 398 memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); 399 impl->h_ind = impl->h_ind_allocated; 400 } 401 break; 402 } 403 if (indices != NULL) { 404 CeedCallSycl(ceed, impl->d_ind = sycl::malloc_device<CeedInt>(size, data->sycl_device, data->sycl_context)); 405 impl->d_ind_allocated = impl->d_ind; // We own the device memory 406 // Order queue 407 sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); 408 // Copy from host to device 409 sycl::event copy_event = data->sycl_queue.copy<CeedInt>(indices, impl->d_ind, size, {e}); 410 // Wait for copy to finish and handle exceptions 411 CeedCallSycl(ceed, copy_event.wait_and_throw()); 412 CeedCallBackend(CeedElemRestrictionOffset_Sycl(r, indices)); 413 } 414 } else if (mem_type == CEED_MEM_DEVICE) { 415 switch (copy_mode) { 416 case CEED_COPY_VALUES: 417 if (indices != NULL) { 418 CeedCallSycl(ceed, impl->d_ind = sycl::malloc_device<CeedInt>(size, data->sycl_device, data->sycl_context)); 419 impl->d_ind_allocated = impl->d_ind; // We own the device memory 420 // Copy from device to device 421 // Order queue 422 sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); 423 sycl::event copy_event = data->sycl_queue.copy<CeedInt>(indices, impl->d_ind, size, {e}); 424 // Wait for copy to finish and handle exceptions 425 CeedCallSycl(ceed, copy_event.wait_and_throw()); 426 } 427 break; 428 case CEED_OWN_POINTER: 429 impl->d_ind = (CeedInt *)indices; 430 impl->d_ind_allocated = impl->d_ind; 431 break; 432 case CEED_USE_POINTER: 433 impl->d_ind = (CeedInt *)indices; 434 } 435 if (indices != NULL) { 436 CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); 437 // Order queue 438 sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); 439 // Copy from device to host 440 sycl::event copy_event = data->sycl_queue.copy<CeedInt>(impl->d_ind, impl->h_ind_allocated, elem_size * num_elem, {e}); 441 CeedCallSycl(ceed, copy_event.wait_and_throw()); 442 impl->h_ind = impl->h_ind_allocated; 443 CeedCallBackend(CeedElemRestrictionOffset_Sycl(r, indices)); 444 } 445 } else { 446 // LCOV_EXCL_START 447 return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported"); 448 // LCOV_EXCL_STOP 449 } 450 451 // Register backend functions 452 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Sycl)); 453 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Sycl)); 454 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Sycl)); 455 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Sycl)); 456 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Sycl)); 457 return CEED_ERROR_SUCCESS; 458 } 459