15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2ff1e7120SSebastian Grimberg // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3ff1e7120SSebastian Grimberg // 4ff1e7120SSebastian Grimberg // SPDX-License-Identifier: BSD-2-Clause 5ff1e7120SSebastian Grimberg // 6ff1e7120SSebastian Grimberg // This file is part of CEED: http://github.com/ceed 7ff1e7120SSebastian Grimberg 8ff1e7120SSebastian Grimberg #include <ceed.h> 9ff1e7120SSebastian Grimberg #include <ceed/backend.h> 10ff1e7120SSebastian Grimberg #include <ceed/jit-tools.h> 11ff1e7120SSebastian Grimberg #include <cuda.h> 12ff1e7120SSebastian Grimberg #include <cuda_runtime.h> 13ff1e7120SSebastian Grimberg #include <stdbool.h> 14ff1e7120SSebastian Grimberg #include <stddef.h> 15ff1e7120SSebastian Grimberg #include <string.h> 16ff1e7120SSebastian Grimberg 17ff1e7120SSebastian Grimberg #include "../cuda/ceed-cuda-common.h" 18ff1e7120SSebastian Grimberg #include "../cuda/ceed-cuda-compile.h" 19ff1e7120SSebastian Grimberg #include "ceed-cuda-ref.h" 20ff1e7120SSebastian Grimberg 21ff1e7120SSebastian Grimberg //------------------------------------------------------------------------------ 22cf8cbdd6SSebastian Grimberg // Compile restriction kernels 23cf8cbdd6SSebastian Grimberg //------------------------------------------------------------------------------ 24cf8cbdd6SSebastian Grimberg static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr) { 25cf8cbdd6SSebastian Grimberg Ceed ceed; 26cf8cbdd6SSebastian Grimberg bool is_deterministic; 2722070f95SJeremy L Thompson char *restriction_kernel_source; 2822070f95SJeremy L Thompson const char *restriction_kernel_path; 29cf8cbdd6SSebastian Grimberg CeedInt num_elem, num_comp, elem_size, comp_stride; 30cf8cbdd6SSebastian Grimberg CeedRestrictionType rstr_type; 31cf8cbdd6SSebastian Grimberg CeedElemRestriction_Cuda *impl; 32cf8cbdd6SSebastian Grimberg 33cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 34cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 35b20a4af9SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 36cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 37cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 38cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); 39b20a4af9SJeremy L Thompson if (rstr_type == CEED_RESTRICTION_POINTS) { 40b20a4af9SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr, &elem_size)); 41b20a4af9SJeremy L Thompson } else { 42b20a4af9SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 43b20a4af9SJeremy L Thompson } 44cf8cbdd6SSebastian Grimberg is_deterministic = impl->d_l_vec_indices != NULL; 45cf8cbdd6SSebastian Grimberg 46cf8cbdd6SSebastian Grimberg // Compile CUDA kernels 47cf8cbdd6SSebastian Grimberg switch (rstr_type) { 48cf8cbdd6SSebastian Grimberg case CEED_RESTRICTION_STRIDED: { 49cf8cbdd6SSebastian Grimberg bool has_backend_strides; 50509d4af6SJeremy L Thompson CeedInt strides[3] = {1, num_elem * elem_size, elem_size}; 51cf8cbdd6SSebastian Grimberg 52cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 53cf8cbdd6SSebastian Grimberg if (!has_backend_strides) { 5456c48462SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); 55cf8cbdd6SSebastian Grimberg } 56cf8cbdd6SSebastian Grimberg 57cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-strided.h", &restriction_kernel_path)); 58cf8cbdd6SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 59cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 60cf8cbdd6SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 61cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 62cf8cbdd6SSebastian Grimberg "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", 63cf8cbdd6SSebastian Grimberg strides[2])); 64cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose)); 65cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose)); 66cf8cbdd6SSebastian Grimberg } break; 670b63de31SJeremy L Thompson case CEED_RESTRICTION_POINTS: { 680b63de31SJeremy L Thompson const char *offset_kernel_path; 690b63de31SJeremy L Thompson char **file_paths = NULL; 700b63de31SJeremy L Thompson CeedInt num_file_paths = 0; 710b63de31SJeremy L Thompson 720b63de31SJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-at-points.h", &restriction_kernel_path)); 730b63de31SJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 740b63de31SJeremy L Thompson CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); 750b63de31SJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path)); 760b63de31SJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); 770b63de31SJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 780b63de31SJeremy L Thompson CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 790b63de31SJeremy L Thompson "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, 800b63de31SJeremy L Thompson "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); 810b63de31SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose)); 820b63de31SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose)); 830b63de31SJeremy L Thompson } break; 84cf8cbdd6SSebastian Grimberg case CEED_RESTRICTION_STANDARD: { 85cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &restriction_kernel_path)); 86cf8cbdd6SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 87cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 88cf8cbdd6SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 89cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 90cf8cbdd6SSebastian Grimberg "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, 91cf8cbdd6SSebastian Grimberg "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); 92cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose)); 93cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose)); 94cf8cbdd6SSebastian Grimberg } break; 95cf8cbdd6SSebastian Grimberg case CEED_RESTRICTION_ORIENTED: { 9622070f95SJeremy L Thompson const char *offset_kernel_path; 97509d4af6SJeremy L Thompson char **file_paths = NULL; 98509d4af6SJeremy L Thompson CeedInt num_file_paths = 0; 99cf8cbdd6SSebastian Grimberg 100cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-oriented.h", &restriction_kernel_path)); 101cf8cbdd6SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 102509d4af6SJeremy L Thompson CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); 103cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path)); 104509d4af6SJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); 105cf8cbdd6SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 106cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 107cf8cbdd6SSebastian Grimberg "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, 108cf8cbdd6SSebastian Grimberg "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); 109cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedNoTranspose", &impl->ApplyNoTranspose)); 110cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose)); 111cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose)); 112cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose)); 113509d4af6SJeremy L Thompson // Cleanup 114cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedFree(&offset_kernel_path)); 1155a5594ffSJeremy L Thompson for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i])); 1165a5594ffSJeremy L Thompson CeedCallBackend(CeedFree(&file_paths)); 117cf8cbdd6SSebastian Grimberg } break; 118cf8cbdd6SSebastian Grimberg case CEED_RESTRICTION_CURL_ORIENTED: { 11922070f95SJeremy L Thompson const char *offset_kernel_path; 120509d4af6SJeremy L Thompson char **file_paths = NULL; 121509d4af6SJeremy L Thompson CeedInt num_file_paths = 0; 122cf8cbdd6SSebastian Grimberg 123cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h", &restriction_kernel_path)); 124cf8cbdd6SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 125509d4af6SJeremy L Thompson CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); 126cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path)); 127509d4af6SJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); 128cf8cbdd6SSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 129cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 130cf8cbdd6SSebastian Grimberg "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, 131cf8cbdd6SSebastian Grimberg "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); 132cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedNoTranspose", &impl->ApplyNoTranspose)); 133cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->ApplyUnsignedNoTranspose)); 134cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnorientedNoTranspose)); 135cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose)); 136cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose)); 137cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose)); 138509d4af6SJeremy L Thompson // Cleanup 139cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedFree(&offset_kernel_path)); 1405a5594ffSJeremy L Thompson for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i])); 1415a5594ffSJeremy L Thompson CeedCallBackend(CeedFree(&file_paths)); 142cf8cbdd6SSebastian Grimberg } break; 143cf8cbdd6SSebastian Grimberg } 144cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedFree(&restriction_kernel_path)); 145cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedFree(&restriction_kernel_source)); 146cf8cbdd6SSebastian Grimberg return CEED_ERROR_SUCCESS; 147cf8cbdd6SSebastian Grimberg } 148cf8cbdd6SSebastian Grimberg 149cf8cbdd6SSebastian Grimberg //------------------------------------------------------------------------------ 150dce49693SSebastian Grimberg // Core apply restriction code 151ff1e7120SSebastian Grimberg //------------------------------------------------------------------------------ 152dce49693SSebastian Grimberg static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, 153dce49693SSebastian Grimberg CeedVector u, CeedVector v, CeedRequest *request) { 154ff1e7120SSebastian Grimberg Ceed ceed; 155dce49693SSebastian Grimberg CeedRestrictionType rstr_type; 156ff1e7120SSebastian Grimberg const CeedScalar *d_u; 157ff1e7120SSebastian Grimberg CeedScalar *d_v; 158ca735530SJeremy L Thompson CeedElemRestriction_Cuda *impl; 159ca735530SJeremy L Thompson 160dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 161dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 162dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 163cf8cbdd6SSebastian Grimberg 164cf8cbdd6SSebastian Grimberg // Assemble kernel if needed 165cf8cbdd6SSebastian Grimberg if (!impl->module) { 166cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedElemRestrictionSetupCompile_Cuda(rstr)); 167cf8cbdd6SSebastian Grimberg } 168ca735530SJeremy L Thompson 169ca735530SJeremy L Thompson // Get vectors 170ff1e7120SSebastian Grimberg CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 171ff1e7120SSebastian Grimberg if (t_mode == CEED_TRANSPOSE) { 172ff1e7120SSebastian Grimberg // Sum into for transpose mode, e-vec to l-vec 173ff1e7120SSebastian Grimberg CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 174ff1e7120SSebastian Grimberg } else { 175ff1e7120SSebastian Grimberg // Overwrite for notranspose mode, l-vec to e-vec 176ff1e7120SSebastian Grimberg CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 177ff1e7120SSebastian Grimberg } 178ff1e7120SSebastian Grimberg 179ff1e7120SSebastian Grimberg // Restrict 180ff1e7120SSebastian Grimberg if (t_mode == CEED_NOTRANSPOSE) { 181ff1e7120SSebastian Grimberg // L-vector -> E-vector 182cf8cbdd6SSebastian Grimberg CeedInt elem_size; 183cf8cbdd6SSebastian Grimberg 184cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 185dce49693SSebastian Grimberg const CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; 186cf8cbdd6SSebastian Grimberg const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); 18758549094SSebastian Grimberg 188dce49693SSebastian Grimberg switch (rstr_type) { 189dce49693SSebastian Grimberg case CEED_RESTRICTION_STRIDED: { 190cf8cbdd6SSebastian Grimberg void *args[] = {&d_u, &d_v}; 19158549094SSebastian Grimberg 192cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); 193dce49693SSebastian Grimberg } break; 194b20a4af9SJeremy L Thompson case CEED_RESTRICTION_POINTS: 195dce49693SSebastian Grimberg case CEED_RESTRICTION_STANDARD: { 196a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &d_u, &d_v}; 197dce49693SSebastian Grimberg 198cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); 199dce49693SSebastian Grimberg } break; 200dce49693SSebastian Grimberg case CEED_RESTRICTION_ORIENTED: { 201dce49693SSebastian Grimberg if (use_signs) { 202a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &impl->d_orients, &d_u, &d_v}; 203dce49693SSebastian Grimberg 204cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); 205dce49693SSebastian Grimberg } else { 206a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &d_u, &d_v}; 207dce49693SSebastian Grimberg 208cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); 209dce49693SSebastian Grimberg } 210dce49693SSebastian Grimberg } break; 211dce49693SSebastian Grimberg case CEED_RESTRICTION_CURL_ORIENTED: { 212dce49693SSebastian Grimberg if (use_signs && use_orients) { 213a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v}; 214dce49693SSebastian Grimberg 215cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); 216dce49693SSebastian Grimberg } else if (use_orients) { 217a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v}; 218dce49693SSebastian Grimberg 219cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); 220dce49693SSebastian Grimberg } else { 221a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &d_u, &d_v}; 222dce49693SSebastian Grimberg 223cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args)); 224dce49693SSebastian Grimberg } 225dce49693SSebastian Grimberg } break; 226ff1e7120SSebastian Grimberg } 227ff1e7120SSebastian Grimberg } else { 228ff1e7120SSebastian Grimberg // E-vector -> L-vector 229cf8cbdd6SSebastian Grimberg const bool is_deterministic = impl->d_l_vec_indices != NULL; 230dce49693SSebastian Grimberg const CeedInt block_size = 32; 231cf8cbdd6SSebastian Grimberg const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); 232ca735530SJeremy L Thompson 233dce49693SSebastian Grimberg switch (rstr_type) { 234dce49693SSebastian Grimberg case CEED_RESTRICTION_STRIDED: { 235cf8cbdd6SSebastian Grimberg void *args[] = {&d_u, &d_v}; 236dce49693SSebastian Grimberg 237cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 238dce49693SSebastian Grimberg } break; 2390b63de31SJeremy L Thompson case CEED_RESTRICTION_POINTS: { 2400b63de31SJeremy L Thompson if (!is_deterministic) { 2410b63de31SJeremy L Thompson void *args[] = {&impl->d_offsets, &impl->d_points_per_elem, &d_u, &d_v}; 2420b63de31SJeremy L Thompson 2430b63de31SJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 2440b63de31SJeremy L Thompson } else { 2450b63de31SJeremy L Thompson void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_points_per_elem, &impl->d_t_offsets, &d_u, &d_v}; 2460b63de31SJeremy L Thompson 2470b63de31SJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 2480b63de31SJeremy L Thompson } 2490b63de31SJeremy L Thompson } break; 250dce49693SSebastian Grimberg case CEED_RESTRICTION_STANDARD: { 251cf8cbdd6SSebastian Grimberg if (!is_deterministic) { 252a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &d_u, &d_v}; 25358549094SSebastian Grimberg 254cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 255ff1e7120SSebastian Grimberg } else { 25658549094SSebastian Grimberg void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 25758549094SSebastian Grimberg 258cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 25958549094SSebastian Grimberg } 260dce49693SSebastian Grimberg } break; 261dce49693SSebastian Grimberg case CEED_RESTRICTION_ORIENTED: { 262dce49693SSebastian Grimberg if (use_signs) { 263cf8cbdd6SSebastian Grimberg if (!is_deterministic) { 264a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &impl->d_orients, &d_u, &d_v}; 26558549094SSebastian Grimberg 266cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 267dce49693SSebastian Grimberg } else { 2687aa91133SSebastian Grimberg void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_orients, &d_u, &d_v}; 2697aa91133SSebastian Grimberg 270cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 2717aa91133SSebastian Grimberg } 2727aa91133SSebastian Grimberg } else { 273cf8cbdd6SSebastian Grimberg if (!is_deterministic) { 274a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &d_u, &d_v}; 275dce49693SSebastian Grimberg 276cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); 277dce49693SSebastian Grimberg } else { 278dce49693SSebastian Grimberg void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 279dce49693SSebastian Grimberg 280cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); 281dce49693SSebastian Grimberg } 282dce49693SSebastian Grimberg } 283dce49693SSebastian Grimberg } break; 284dce49693SSebastian Grimberg case CEED_RESTRICTION_CURL_ORIENTED: { 285dce49693SSebastian Grimberg if (use_signs && use_orients) { 286cf8cbdd6SSebastian Grimberg if (!is_deterministic) { 287a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v}; 288dce49693SSebastian Grimberg 289cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 2907aa91133SSebastian Grimberg } else { 2917aa91133SSebastian Grimberg void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; 2927aa91133SSebastian Grimberg 293cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 2947aa91133SSebastian Grimberg } 295dce49693SSebastian Grimberg } else if (use_orients) { 296cf8cbdd6SSebastian Grimberg if (!is_deterministic) { 297a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v}; 298dce49693SSebastian Grimberg 299cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); 300dce49693SSebastian Grimberg } else { 3017aa91133SSebastian Grimberg void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; 3027aa91133SSebastian Grimberg 303cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); 3047aa91133SSebastian Grimberg } 3057aa91133SSebastian Grimberg } else { 306cf8cbdd6SSebastian Grimberg if (!is_deterministic) { 307a267acd1SJeremy L Thompson void *args[] = {&impl->d_offsets, &d_u, &d_v}; 308dce49693SSebastian Grimberg 309cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); 310dce49693SSebastian Grimberg } else { 311dce49693SSebastian Grimberg void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 312dce49693SSebastian Grimberg 313cf8cbdd6SSebastian Grimberg CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); 314dce49693SSebastian Grimberg } 315dce49693SSebastian Grimberg } 316dce49693SSebastian Grimberg } break; 317ff1e7120SSebastian Grimberg } 318ff1e7120SSebastian Grimberg } 319ff1e7120SSebastian Grimberg 320ff1e7120SSebastian Grimberg if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; 321ff1e7120SSebastian Grimberg 322ff1e7120SSebastian Grimberg // Restore arrays 323ff1e7120SSebastian Grimberg CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 324ff1e7120SSebastian Grimberg CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 325ff1e7120SSebastian Grimberg return CEED_ERROR_SUCCESS; 326ff1e7120SSebastian Grimberg } 327ff1e7120SSebastian Grimberg 328ff1e7120SSebastian Grimberg //------------------------------------------------------------------------------ 329dce49693SSebastian Grimberg // Apply restriction 330dce49693SSebastian Grimberg //------------------------------------------------------------------------------ 331dce49693SSebastian Grimberg static int CeedElemRestrictionApply_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { 332dce49693SSebastian Grimberg return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, true, true, u, v, request); 333dce49693SSebastian Grimberg } 334dce49693SSebastian Grimberg 335dce49693SSebastian Grimberg //------------------------------------------------------------------------------ 336dce49693SSebastian Grimberg // Apply unsigned restriction 337dce49693SSebastian Grimberg //------------------------------------------------------------------------------ 338dce49693SSebastian Grimberg static int CeedElemRestrictionApplyUnsigned_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 339dce49693SSebastian Grimberg CeedRequest *request) { 340dce49693SSebastian Grimberg return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, true, u, v, request); 341dce49693SSebastian Grimberg } 342dce49693SSebastian Grimberg 343dce49693SSebastian Grimberg //------------------------------------------------------------------------------ 344dce49693SSebastian Grimberg // Apply unoriented restriction 345dce49693SSebastian Grimberg //------------------------------------------------------------------------------ 346dce49693SSebastian Grimberg static int CeedElemRestrictionApplyUnoriented_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 347dce49693SSebastian Grimberg CeedRequest *request) { 348dce49693SSebastian Grimberg return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, false, u, v, request); 349dce49693SSebastian Grimberg } 350dce49693SSebastian Grimberg 351dce49693SSebastian Grimberg //------------------------------------------------------------------------------ 352ff1e7120SSebastian Grimberg // Get offsets 353ff1e7120SSebastian Grimberg //------------------------------------------------------------------------------ 354ff1e7120SSebastian Grimberg static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { 355ff1e7120SSebastian Grimberg CeedElemRestriction_Cuda *impl; 356b20a4af9SJeremy L Thompson CeedRestrictionType rstr_type; 357ff1e7120SSebastian Grimberg 358ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 359b20a4af9SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 360ff1e7120SSebastian Grimberg switch (mem_type) { 361ff1e7120SSebastian Grimberg case CEED_MEM_HOST: 362b20a4af9SJeremy L Thompson *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->h_offsets_at_points : impl->h_offsets; 363ff1e7120SSebastian Grimberg break; 364ff1e7120SSebastian Grimberg case CEED_MEM_DEVICE: 365b20a4af9SJeremy L Thompson *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->d_offsets_at_points : impl->d_offsets; 366ff1e7120SSebastian Grimberg break; 367ff1e7120SSebastian Grimberg } 368ff1e7120SSebastian Grimberg return CEED_ERROR_SUCCESS; 369ff1e7120SSebastian Grimberg } 370ff1e7120SSebastian Grimberg 371ff1e7120SSebastian Grimberg //------------------------------------------------------------------------------ 372dce49693SSebastian Grimberg // Get orientations 373dce49693SSebastian Grimberg //------------------------------------------------------------------------------ 374dce49693SSebastian Grimberg static int CeedElemRestrictionGetOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) { 375dce49693SSebastian Grimberg CeedElemRestriction_Cuda *impl; 376dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 377dce49693SSebastian Grimberg 378dce49693SSebastian Grimberg switch (mem_type) { 379dce49693SSebastian Grimberg case CEED_MEM_HOST: 380dce49693SSebastian Grimberg *orients = impl->h_orients; 381dce49693SSebastian Grimberg break; 382dce49693SSebastian Grimberg case CEED_MEM_DEVICE: 383dce49693SSebastian Grimberg *orients = impl->d_orients; 384dce49693SSebastian Grimberg break; 385dce49693SSebastian Grimberg } 386dce49693SSebastian Grimberg return CEED_ERROR_SUCCESS; 387dce49693SSebastian Grimberg } 388dce49693SSebastian Grimberg 389dce49693SSebastian Grimberg //------------------------------------------------------------------------------ 390dce49693SSebastian Grimberg // Get curl-conforming orientations 391dce49693SSebastian Grimberg //------------------------------------------------------------------------------ 392dce49693SSebastian Grimberg static int CeedElemRestrictionGetCurlOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) { 393dce49693SSebastian Grimberg CeedElemRestriction_Cuda *impl; 394dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 395dce49693SSebastian Grimberg 396dce49693SSebastian Grimberg switch (mem_type) { 397dce49693SSebastian Grimberg case CEED_MEM_HOST: 398dce49693SSebastian Grimberg *curl_orients = impl->h_curl_orients; 399dce49693SSebastian Grimberg break; 400dce49693SSebastian Grimberg case CEED_MEM_DEVICE: 401dce49693SSebastian Grimberg *curl_orients = impl->d_curl_orients; 402dce49693SSebastian Grimberg break; 403dce49693SSebastian Grimberg } 404dce49693SSebastian Grimberg return CEED_ERROR_SUCCESS; 405dce49693SSebastian Grimberg } 406dce49693SSebastian Grimberg 407dce49693SSebastian Grimberg //------------------------------------------------------------------------------ 408b20a4af9SJeremy L Thompson // Get offset for padded AtPoints E-layout 409b20a4af9SJeremy L Thompson //------------------------------------------------------------------------------ 410b20a4af9SJeremy L Thompson static int CeedElemRestrictionGetAtPointsElementOffset_Cuda(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset) { 411b20a4af9SJeremy L Thompson CeedInt layout[3]; 412b20a4af9SJeremy L Thompson 413b20a4af9SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetELayout(rstr, layout)); 414b20a4af9SJeremy L Thompson *elem_offset = 0 * layout[0] + 0 * layout[1] + elem * layout[2]; 415b20a4af9SJeremy L Thompson return CEED_ERROR_SUCCESS; 416b20a4af9SJeremy L Thompson } 417b20a4af9SJeremy L Thompson 418b20a4af9SJeremy L Thompson //------------------------------------------------------------------------------ 419ff1e7120SSebastian Grimberg // Destroy restriction 420ff1e7120SSebastian Grimberg //------------------------------------------------------------------------------ 421dce49693SSebastian Grimberg static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) { 422ff1e7120SSebastian Grimberg Ceed ceed; 423ca735530SJeremy L Thompson CeedElemRestriction_Cuda *impl; 424ca735530SJeremy L Thompson 425dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 426dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 427cf8cbdd6SSebastian Grimberg if (impl->module) { 428ff1e7120SSebastian Grimberg CeedCallCuda(ceed, cuModuleUnload(impl->module)); 429cf8cbdd6SSebastian Grimberg } 430a267acd1SJeremy L Thompson CeedCallBackend(CeedFree(&impl->h_offsets_owned)); 431f5d1e504SJeremy L Thompson CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_offsets_owned)); 432081aa29dSJeremy L Thompson CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_t_offsets)); 433081aa29dSJeremy L Thompson CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_t_indices)); 434081aa29dSJeremy L Thompson CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_l_vec_indices)); 435a267acd1SJeremy L Thompson CeedCallBackend(CeedFree(&impl->h_orients_owned)); 436f5d1e504SJeremy L Thompson CeedCallCuda(ceed, cudaFree((bool *)impl->d_orients_owned)); 437a267acd1SJeremy L Thompson CeedCallBackend(CeedFree(&impl->h_curl_orients_owned)); 438f5d1e504SJeremy L Thompson CeedCallCuda(ceed, cudaFree((CeedInt8 *)impl->d_curl_orients_owned)); 439b20a4af9SJeremy L Thompson CeedCallBackend(CeedFree(&impl->h_offsets_at_points_owned)); 440b20a4af9SJeremy L Thompson CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_offsets_at_points_owned)); 4410b63de31SJeremy L Thompson CeedCallBackend(CeedFree(&impl->h_points_per_elem_owned)); 4420b63de31SJeremy L Thompson CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_points_per_elem_owned)); 443ff1e7120SSebastian Grimberg CeedCallBackend(CeedFree(&impl)); 444ff1e7120SSebastian Grimberg return CEED_ERROR_SUCCESS; 445ff1e7120SSebastian Grimberg } 446ff1e7120SSebastian Grimberg 447ff1e7120SSebastian Grimberg //------------------------------------------------------------------------------ 448ff1e7120SSebastian Grimberg // Create transpose offsets and indices 449ff1e7120SSebastian Grimberg //------------------------------------------------------------------------------ 450b20a4af9SJeremy L Thompson static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt elem_size, const CeedInt *indices) { 451ff1e7120SSebastian Grimberg Ceed ceed; 452ca735530SJeremy L Thompson bool *is_node; 453ff1e7120SSebastian Grimberg CeedSize l_size; 454b20a4af9SJeremy L Thompson CeedInt num_elem, num_comp, num_nodes = 0; 455ca735530SJeremy L Thompson CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; 456b20a4af9SJeremy L Thompson CeedRestrictionType rstr_type; 457ca735530SJeremy L Thompson CeedElemRestriction_Cuda *impl; 458ca735530SJeremy L Thompson 459dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 460dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 461dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 462b20a4af9SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 463dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); 464dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 465ca735530SJeremy L Thompson const CeedInt size_indices = num_elem * elem_size; 466ff1e7120SSebastian Grimberg 467ff1e7120SSebastian Grimberg // Count num_nodes 468ff1e7120SSebastian Grimberg CeedCallBackend(CeedCalloc(l_size, &is_node)); 469ca735530SJeremy L Thompson 470ff1e7120SSebastian Grimberg for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; 471ff1e7120SSebastian Grimberg for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; 472ff1e7120SSebastian Grimberg impl->num_nodes = num_nodes; 473ff1e7120SSebastian Grimberg 474ff1e7120SSebastian Grimberg // L-vector offsets array 475ff1e7120SSebastian Grimberg CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); 476ff1e7120SSebastian Grimberg CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); 477ca735530SJeremy L Thompson for (CeedInt i = 0, j = 0; i < l_size; i++) { 478ff1e7120SSebastian Grimberg if (is_node[i]) { 479ff1e7120SSebastian Grimberg l_vec_indices[j] = i; 480ff1e7120SSebastian Grimberg ind_to_offset[i] = j++; 481ff1e7120SSebastian Grimberg } 482ff1e7120SSebastian Grimberg } 483ff1e7120SSebastian Grimberg CeedCallBackend(CeedFree(&is_node)); 484ff1e7120SSebastian Grimberg 485ff1e7120SSebastian Grimberg // Compute transpose offsets and indices 486ff1e7120SSebastian Grimberg const CeedInt size_offsets = num_nodes + 1; 487ca735530SJeremy L Thompson 488ff1e7120SSebastian Grimberg CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); 489ff1e7120SSebastian Grimberg CeedCallBackend(CeedMalloc(size_indices, &t_indices)); 490ff1e7120SSebastian Grimberg // Count node multiplicity 491ff1e7120SSebastian Grimberg for (CeedInt e = 0; e < num_elem; ++e) { 492ff1e7120SSebastian Grimberg for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1]; 493ff1e7120SSebastian Grimberg } 494ff1e7120SSebastian Grimberg // Convert to running sum 495ff1e7120SSebastian Grimberg for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1]; 496ff1e7120SSebastian Grimberg // List all E-vec indices associated with L-vec node 497ff1e7120SSebastian Grimberg for (CeedInt e = 0; e < num_elem; ++e) { 498ff1e7120SSebastian Grimberg for (CeedInt i = 0; i < elem_size; ++i) { 499ff1e7120SSebastian Grimberg const CeedInt lid = elem_size * e + i; 500ff1e7120SSebastian Grimberg const CeedInt gid = indices[lid]; 501ca735530SJeremy L Thompson 502ff1e7120SSebastian Grimberg t_indices[t_offsets[ind_to_offset[gid]]++] = lid; 503ff1e7120SSebastian Grimberg } 504ff1e7120SSebastian Grimberg } 505ff1e7120SSebastian Grimberg // Reset running sum 506ff1e7120SSebastian Grimberg for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1]; 507ff1e7120SSebastian Grimberg t_offsets[0] = 0; 508ff1e7120SSebastian Grimberg 509ff1e7120SSebastian Grimberg // Copy data to device 510ff1e7120SSebastian Grimberg // -- L-vector indices 511ff1e7120SSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt))); 512081aa29dSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), cudaMemcpyHostToDevice)); 513ff1e7120SSebastian Grimberg // -- Transpose offsets 514ff1e7120SSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt))); 515081aa29dSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), cudaMemcpyHostToDevice)); 516ff1e7120SSebastian Grimberg // -- Transpose indices 517ff1e7120SSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt))); 518081aa29dSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), cudaMemcpyHostToDevice)); 519ff1e7120SSebastian Grimberg 520ff1e7120SSebastian Grimberg // Cleanup 521ff1e7120SSebastian Grimberg CeedCallBackend(CeedFree(&ind_to_offset)); 522ff1e7120SSebastian Grimberg CeedCallBackend(CeedFree(&l_vec_indices)); 523ff1e7120SSebastian Grimberg CeedCallBackend(CeedFree(&t_offsets)); 524ff1e7120SSebastian Grimberg CeedCallBackend(CeedFree(&t_indices)); 525ff1e7120SSebastian Grimberg return CEED_ERROR_SUCCESS; 526ff1e7120SSebastian Grimberg } 527ff1e7120SSebastian Grimberg 528ff1e7120SSebastian Grimberg //------------------------------------------------------------------------------ 529ff1e7120SSebastian Grimberg // Create restriction 530ff1e7120SSebastian Grimberg //------------------------------------------------------------------------------ 531a267acd1SJeremy L Thompson int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, const bool *orients, 532dce49693SSebastian Grimberg const CeedInt8 *curl_orients, CeedElemRestriction rstr) { 533ca735530SJeremy L Thompson Ceed ceed, ceed_parent; 534dce49693SSebastian Grimberg bool is_deterministic; 535b20a4af9SJeremy L Thompson CeedInt num_elem, num_comp, elem_size; 536ca735530SJeremy L Thompson CeedRestrictionType rstr_type; 537ff1e7120SSebastian Grimberg CeedElemRestriction_Cuda *impl; 538ca735530SJeremy L Thompson 539dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 540ca735530SJeremy L Thompson CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); 541ca735530SJeremy L Thompson CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); 542dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 543b20a4af9SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 544dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 54522eb1385SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 546b20a4af9SJeremy L Thompson // Use max number of points as elem size for AtPoints restrictions 547b20a4af9SJeremy L Thompson if (rstr_type == CEED_RESTRICTION_POINTS) { 548b20a4af9SJeremy L Thompson CeedInt max_points = 0; 549b20a4af9SJeremy L Thompson 550b20a4af9SJeremy L Thompson for (CeedInt i = 0; i < num_elem; i++) { 551b20a4af9SJeremy L Thompson max_points = CeedIntMax(max_points, offsets[i + 1] - offsets[i]); 552b20a4af9SJeremy L Thompson } 553b20a4af9SJeremy L Thompson elem_size = max_points; 554b20a4af9SJeremy L Thompson } 555ca735530SJeremy L Thompson const CeedInt size = num_elem * elem_size; 556ff1e7120SSebastian Grimberg 557dce49693SSebastian Grimberg CeedCallBackend(CeedCalloc(1, &impl)); 558dce49693SSebastian Grimberg impl->num_nodes = size; 559dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); 56022eb1385SJeremy L Thompson 56122eb1385SJeremy L Thompson // Set layouts 56222eb1385SJeremy L Thompson { 56322eb1385SJeremy L Thompson bool has_backend_strides; 56422eb1385SJeremy L Thompson CeedInt layout[3] = {1, size, elem_size}; 56522eb1385SJeremy L Thompson 566dce49693SSebastian Grimberg CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); 56722eb1385SJeremy L Thompson if (rstr_type == CEED_RESTRICTION_STRIDED) { 56822eb1385SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 56922eb1385SJeremy L Thompson if (has_backend_strides) { 57022eb1385SJeremy L Thompson CeedCallBackend(CeedElemRestrictionSetLLayout(rstr, layout)); 57122eb1385SJeremy L Thompson } 57222eb1385SJeremy L Thompson } 57322eb1385SJeremy L Thompson } 574ff1e7120SSebastian Grimberg 575b20a4af9SJeremy L Thompson // Pad AtPoints indices 576b20a4af9SJeremy L Thompson if (rstr_type == CEED_RESTRICTION_POINTS) { 577b20a4af9SJeremy L Thompson CeedSize offsets_len = elem_size * num_elem, at_points_size = num_elem + 1; 5780b63de31SJeremy L Thompson CeedInt max_points = elem_size, *offsets_padded, *points_per_elem; 579b20a4af9SJeremy L Thompson 580b20a4af9SJeremy L Thompson CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "only MemType Host supported when creating AtPoints restriction"); 581b20a4af9SJeremy L Thompson CeedCallBackend(CeedMalloc(offsets_len, &offsets_padded)); 5820b63de31SJeremy L Thompson CeedCallBackend(CeedMalloc(num_elem, &points_per_elem)); 583b20a4af9SJeremy L Thompson for (CeedInt i = 0; i < num_elem; i++) { 584b20a4af9SJeremy L Thompson CeedInt num_points = offsets[i + 1] - offsets[i]; 585b20a4af9SJeremy L Thompson 5860b63de31SJeremy L Thompson points_per_elem[i] = num_points; 587b20a4af9SJeremy L Thompson at_points_size += num_points; 588b20a4af9SJeremy L Thompson // -- Copy all points in element 589b20a4af9SJeremy L Thompson for (CeedInt j = 0; j < num_points; j++) { 5908be297eeSJeremy L Thompson offsets_padded[i * max_points + j] = offsets[offsets[i] + j] * num_comp; 591b20a4af9SJeremy L Thompson } 592b20a4af9SJeremy L Thompson // -- Replicate out last point in element 593b20a4af9SJeremy L Thompson for (CeedInt j = num_points; j < max_points; j++) { 5948be297eeSJeremy L Thompson offsets_padded[i * max_points + j] = offsets[offsets[i] + num_points - 1] * num_comp; 595b20a4af9SJeremy L Thompson } 596b20a4af9SJeremy L Thompson } 597b20a4af9SJeremy L Thompson CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed, 598b20a4af9SJeremy L Thompson &impl->h_offsets_at_points)); 599b20a4af9SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_offsets_at_points_owned, at_points_size * sizeof(CeedInt))); 600b20a4af9SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy((CeedInt **)impl->d_offsets_at_points_owned, impl->h_offsets_at_points, at_points_size * sizeof(CeedInt), 601b20a4af9SJeremy L Thompson cudaMemcpyHostToDevice)); 602b20a4af9SJeremy L Thompson impl->d_offsets_at_points = (CeedInt *)impl->d_offsets_at_points_owned; 603b20a4af9SJeremy L Thompson 604b20a4af9SJeremy L Thompson // -- Use padded offsets for the rest of the setup 605b20a4af9SJeremy L Thompson offsets = (const CeedInt *)offsets_padded; 606b20a4af9SJeremy L Thompson copy_mode = CEED_OWN_POINTER; 607*2e88d319SJeremy L Thompson CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, elem_size * num_elem * num_comp)); 6080b63de31SJeremy L Thompson 6090b63de31SJeremy L Thompson // -- Points per element 6100b63de31SJeremy L Thompson CeedCallBackend(CeedSetHostCeedIntArray(points_per_elem, CEED_OWN_POINTER, num_elem, &impl->h_points_per_elem_owned, 6110b63de31SJeremy L Thompson &impl->h_points_per_elem_borrowed, &impl->h_points_per_elem)); 6120b63de31SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_points_per_elem_owned, num_elem * sizeof(CeedInt))); 6130b63de31SJeremy L Thompson CeedCallCuda(ceed, 6140b63de31SJeremy L Thompson cudaMemcpy((CeedInt **)impl->d_points_per_elem_owned, impl->h_points_per_elem, num_elem * sizeof(CeedInt), cudaMemcpyHostToDevice)); 6150b63de31SJeremy L Thompson impl->d_points_per_elem = (CeedInt *)impl->d_points_per_elem_owned; 616b20a4af9SJeremy L Thompson } 617b20a4af9SJeremy L Thompson 618dce49693SSebastian Grimberg // Set up device offset/orientation arrays 619dce49693SSebastian Grimberg if (rstr_type != CEED_RESTRICTION_STRIDED) { 620ff1e7120SSebastian Grimberg switch (mem_type) { 621ff1e7120SSebastian Grimberg case CEED_MEM_HOST: { 622f5d1e504SJeremy L Thompson CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, size, &impl->h_offsets_owned, &impl->h_offsets_borrowed, &impl->h_offsets)); 623a267acd1SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_offsets_owned, size * sizeof(CeedInt))); 624f5d1e504SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_offsets_owned, impl->h_offsets, size * sizeof(CeedInt), cudaMemcpyHostToDevice)); 625f5d1e504SJeremy L Thompson impl->d_offsets = (CeedInt *)impl->d_offsets_owned; 626b20a4af9SJeremy L Thompson if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, elem_size, offsets)); 627dce49693SSebastian Grimberg } break; 628ff1e7120SSebastian Grimberg case CEED_MEM_DEVICE: { 629f5d1e504SJeremy L Thompson CeedCallBackend(CeedSetDeviceCeedIntArray_Cuda(ceed, offsets, copy_mode, size, &impl->d_offsets_owned, &impl->d_offsets_borrowed, 630f5d1e504SJeremy L Thompson (const CeedInt **)&impl->d_offsets)); 631a267acd1SJeremy L Thompson CeedCallBackend(CeedMalloc(size, &impl->h_offsets_owned)); 632f5d1e504SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->h_offsets_owned, impl->d_offsets, size * sizeof(CeedInt), cudaMemcpyDeviceToHost)); 633a267acd1SJeremy L Thompson impl->h_offsets = impl->h_offsets_owned; 634b20a4af9SJeremy L Thompson if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, elem_size, offsets)); 635dce49693SSebastian Grimberg } break; 636ff1e7120SSebastian Grimberg } 637ff1e7120SSebastian Grimberg 638dce49693SSebastian Grimberg // Orientation data 639dce49693SSebastian Grimberg if (rstr_type == CEED_RESTRICTION_ORIENTED) { 640dce49693SSebastian Grimberg switch (mem_type) { 641dce49693SSebastian Grimberg case CEED_MEM_HOST: { 642f5d1e504SJeremy L Thompson CeedCallBackend(CeedSetHostBoolArray(orients, copy_mode, size, &impl->h_orients_owned, &impl->h_orients_borrowed, &impl->h_orients)); 643a267acd1SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_orients_owned, size * sizeof(bool))); 644f5d1e504SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy((bool *)impl->d_orients_owned, impl->h_orients, size * sizeof(bool), cudaMemcpyHostToDevice)); 645a267acd1SJeremy L Thompson impl->d_orients = impl->d_orients_owned; 646dce49693SSebastian Grimberg } break; 647dce49693SSebastian Grimberg case CEED_MEM_DEVICE: { 648f5d1e504SJeremy L Thompson CeedCallBackend(CeedSetDeviceBoolArray_Cuda(ceed, orients, copy_mode, size, &impl->d_orients_owned, &impl->d_orients_borrowed, 649f5d1e504SJeremy L Thompson (const bool **)&impl->d_orients)); 650a267acd1SJeremy L Thompson CeedCallBackend(CeedMalloc(size, &impl->h_orients_owned)); 651f5d1e504SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy((bool *)impl->h_orients_owned, impl->d_orients, size * sizeof(bool), cudaMemcpyDeviceToHost)); 652a267acd1SJeremy L Thompson impl->h_orients = impl->h_orients_owned; 653dce49693SSebastian Grimberg } break; 654dce49693SSebastian Grimberg } 655dce49693SSebastian Grimberg } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) { 656dce49693SSebastian Grimberg switch (mem_type) { 657dce49693SSebastian Grimberg case CEED_MEM_HOST: { 658f5d1e504SJeremy L Thompson CeedCallBackend(CeedSetHostCeedInt8Array(curl_orients, copy_mode, 3 * size, &impl->h_curl_orients_owned, &impl->h_curl_orients_borrowed, 659f5d1e504SJeremy L Thompson &impl->h_curl_orients)); 660a267acd1SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_curl_orients_owned, 3 * size * sizeof(CeedInt8))); 661f5d1e504SJeremy L Thompson CeedCallCuda(ceed, 662f5d1e504SJeremy L Thompson cudaMemcpy((CeedInt8 *)impl->d_curl_orients_owned, impl->h_curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyHostToDevice)); 663a267acd1SJeremy L Thompson impl->d_curl_orients = impl->d_curl_orients_owned; 664dce49693SSebastian Grimberg } break; 665dce49693SSebastian Grimberg case CEED_MEM_DEVICE: { 666f5d1e504SJeremy L Thompson CeedCallBackend(CeedSetDeviceCeedInt8Array_Cuda(ceed, curl_orients, copy_mode, 3 * size, &impl->d_curl_orients_owned, 667f5d1e504SJeremy L Thompson &impl->d_curl_orients_borrowed, (const CeedInt8 **)&impl->d_curl_orients)); 668a267acd1SJeremy L Thompson CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_owned)); 669f5d1e504SJeremy L Thompson CeedCallCuda(ceed, 670f5d1e504SJeremy L Thompson cudaMemcpy((CeedInt8 *)impl->h_curl_orients_owned, impl->d_curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyDeviceToHost)); 671a267acd1SJeremy L Thompson impl->h_curl_orients = impl->h_curl_orients_owned; 672dce49693SSebastian Grimberg } break; 673dce49693SSebastian Grimberg } 674dce49693SSebastian Grimberg } 675dce49693SSebastian Grimberg } 676ca735530SJeremy L Thompson 677ff1e7120SSebastian Grimberg // Register backend functions 678dce49693SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Cuda)); 679dce49693SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Cuda)); 680dce49693SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Cuda)); 681dce49693SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda)); 682dce49693SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Cuda)); 683dce49693SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Cuda)); 684b20a4af9SJeremy L Thompson if (rstr_type == CEED_RESTRICTION_POINTS) { 685b20a4af9SJeremy L Thompson CeedCallBackend( 686b20a4af9SJeremy L Thompson CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset", CeedElemRestrictionGetAtPointsElementOffset_Cuda)); 687b20a4af9SJeremy L Thompson } 688dce49693SSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda)); 689ff1e7120SSebastian Grimberg return CEED_ERROR_SUCCESS; 690ff1e7120SSebastian Grimberg } 691ff1e7120SSebastian Grimberg 692ff1e7120SSebastian Grimberg //------------------------------------------------------------------------------ 693