xref: /libCEED/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp (revision fe96005463bdbb79b892d21a5c89e2b475ecf62b)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/backend.h>
9 #include <ceed/ceed.h>
10 #include <ceed/jit-tools.h>
11 
12 #include <map>
13 #include <string_view>
14 #include <sycl/sycl.hpp>
15 
16 #include "../sycl/ceed-sycl-compile.hpp"
17 #include "ceed-sycl-shared.hpp"
18 
19 //------------------------------------------------------------------------------
20 // Compute the local range of for basis kernels
21 //------------------------------------------------------------------------------
22 static int ComputeLocalRange(Ceed ceed, CeedInt dim, CeedInt thread_1d, CeedInt *local_range, CeedInt max_group_size = 256) {
23   local_range[0]               = thread_1d;
24   local_range[1]               = (dim > 1) ? thread_1d : 1;
25   const CeedInt min_group_size = local_range[0] * local_range[1];
26 
27   CeedCheck(min_group_size <= max_group_size, ceed, CEED_ERROR_BACKEND, "Requested group size is smaller than the required minimum.");
28 
29   local_range[2] = max_group_size / min_group_size;  // elements per group
30   return CEED_ERROR_SUCCESS;
31 }
32 
33 //------------------------------------------------------------------------------
34 // Apply basis
35 //------------------------------------------------------------------------------
36 int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
37                                      CeedVector v) {
38   Ceed                   ceed;
39   Ceed_Sycl             *ceed_Sycl;
40   const CeedScalar      *d_u;
41   CeedScalar            *d_v;
42   CeedBasis_Sycl_shared *impl;
43 
44   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
45   CeedCallBackend(CeedGetData(ceed, &ceed_Sycl));
46   CeedCallBackend(CeedBasisGetData(basis, &impl));
47 
48   // Get read/write access to u, v
49   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
50   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
51   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
52 
53   // Apply basis operation
54   switch (eval_mode) {
55     case CEED_EVAL_INTERP: {
56       CeedInt       *lrange         = impl->interp_local_range;
57       const CeedInt &elem_per_group = lrange[2];
58       const CeedInt  group_count    = (num_elem / elem_per_group) + !!(num_elem % elem_per_group);
59       //-----------
60       sycl::range<3>    local_range(lrange[2], lrange[1], lrange[0]);
61       sycl::range<3>    global_range(group_count * lrange[2], lrange[1], lrange[0]);
62       sycl::nd_range<3> kernel_range(global_range, local_range);
63       //-----------
64       sycl::kernel *interp_kernel = (t_mode == CEED_TRANSPOSE) ? impl->interp_transpose_kernel : impl->interp_kernel;
65 
66       std::vector<sycl::event> e;
67 
68       if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};
69       ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
70         cgh.depends_on(e);
71         cgh.set_args(num_elem, impl->d_interp_1d, d_u, d_v);
72         cgh.parallel_for(kernel_range, *interp_kernel);
73       });
74 
75     } break;
76     case CEED_EVAL_GRAD: {
77       CeedInt       *lrange         = impl->grad_local_range;
78       const CeedInt &elem_per_group = lrange[2];
79       const CeedInt  group_count    = (num_elem / elem_per_group) + !!(num_elem % elem_per_group);
80       //-----------
81       sycl::range<3>    local_range(lrange[2], lrange[1], lrange[0]);
82       sycl::range<3>    global_range(group_count * lrange[2], lrange[1], lrange[0]);
83       sycl::nd_range<3> kernel_range(global_range, local_range);
84       //-----------
85       sycl::kernel     *grad_kernel = (t_mode == CEED_TRANSPOSE) ? impl->grad_transpose_kernel : impl->grad_kernel;
86       const CeedScalar *d_grad_1d   = (impl->d_collo_grad_1d) ? impl->d_collo_grad_1d : impl->d_grad_1d;
87 
88       std::vector<sycl::event> e;
89 
90       if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};
91 
92       ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
93         cgh.depends_on(e);
94         cgh.set_args(num_elem, impl->d_interp_1d, d_grad_1d, d_u, d_v);
95         cgh.parallel_for(kernel_range, *grad_kernel);
96       });
97     } break;
98     case CEED_EVAL_WEIGHT: {
99       CeedInt       *lrange         = impl->weight_local_range;
100       const CeedInt &elem_per_group = lrange[2];
101       const CeedInt  group_count    = (num_elem / elem_per_group) + !!(num_elem % elem_per_group);
102       //-----------
103       sycl::range<3>    local_range(lrange[2], lrange[1], lrange[0]);
104       sycl::range<3>    global_range(group_count * lrange[2], lrange[1], lrange[0]);
105       sycl::nd_range<3> kernel_range(global_range, local_range);
106       //-----------
107       std::vector<sycl::event> e;
108 
109       if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};
110 
111       ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
112         cgh.depends_on(e);
113         cgh.set_args(num_elem, impl->d_q_weight_1d, d_v);
114         cgh.parallel_for(kernel_range, *(impl->weight_kernel));
115       });
116     } break;
117     case CEED_EVAL_NONE: /* handled separately below */
118       break;
119     // LCOV_EXCL_START
120     case CEED_EVAL_DIV:
121     case CEED_EVAL_CURL:
122       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
123       // LCOV_EXCL_STOP
124   }
125 
126   // Restore vectors, cover CEED_EVAL_NONE
127   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
128   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
129   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
130 
131   return CEED_ERROR_SUCCESS;
132 }
133 
134 //------------------------------------------------------------------------------
135 // Destroy basis
136 //------------------------------------------------------------------------------
137 static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) {
138   Ceed                   ceed;
139   Ceed_Sycl             *data;
140   CeedBasis_Sycl_shared *impl;
141 
142   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
143   CeedCallBackend(CeedBasisGetData(basis, &impl));
144   CeedCallBackend(CeedGetData(ceed, &data));
145   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
146   CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
147   CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context));
148   CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context));
149   CeedCallSycl(ceed, sycl::free(impl->d_collo_grad_1d, data->sycl_context));
150 
151   delete impl->interp_kernel;
152   delete impl->interp_transpose_kernel;
153   delete impl->grad_kernel;
154   delete impl->grad_transpose_kernel;
155   delete impl->weight_kernel;
156   delete impl->sycl_module;
157 
158   CeedCallBackend(CeedFree(&impl));
159   return CEED_ERROR_SUCCESS;
160 }
161 
162 //------------------------------------------------------------------------------
163 // Create tensor basis
164 // TODO: Refactor
165 //------------------------------------------------------------------------------
166 int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
167                                         const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
168   Ceed                   ceed;
169   Ceed_Sycl             *data;
170   char                  *basis_kernel_source;
171   const char            *basis_kernel_path;
172   CeedInt                num_comp;
173   CeedBasis_Sycl_shared *impl;
174 
175   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
176   CeedCallBackend(CeedCalloc(1, &impl));
177   CeedCallBackend(CeedGetData(ceed, &data));
178   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
179 
180   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
181   const CeedInt num_nodes = CeedIntPow(P_1d, dim);
182   const CeedInt num_qpts  = CeedIntPow(Q_1d, dim);
183 
184   CeedInt *interp_lrange = impl->interp_local_range;
185 
186   CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, interp_lrange));
187   const CeedInt interp_group_size = interp_lrange[0] * interp_lrange[1] * interp_lrange[2];
188 
189   CeedInt *grad_lrange = impl->grad_local_range;
190 
191   CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, grad_lrange));
192   const CeedInt grad_group_size = grad_lrange[0] * grad_lrange[1] * grad_lrange[2];
193 
194   CeedCallBackend(ComputeLocalRange(ceed, dim, Q_1d, impl->weight_local_range));
195 
196   std::vector<sycl::event> e;
197 
198   if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
199 
200   // Copy basis data to GPU
201   CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
202   sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
203 
204   const CeedInt interp_length = Q_1d * P_1d;
205   CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
206   sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, e);
207 
208   CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
209   sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, e);
210 
211   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
212 
213   // Compute collocated gradient and copy to GPU
214   impl->d_collo_grad_1d          = NULL;
215   const bool has_collocated_grad = (dim == 3) && (Q_1d >= P_1d);
216 
217   if (has_collocated_grad) {
218     CeedScalar   *collo_grad_1d;
219     const CeedInt cgrad_length = Q_1d * Q_1d;
220 
221     CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d));
222     CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d));
223     CeedCallSycl(ceed, impl->d_collo_grad_1d = sycl::malloc_device<CeedScalar>(cgrad_length, data->sycl_device, data->sycl_context));
224     CeedCallSycl(ceed, data->sycl_queue.copy<CeedScalar>(collo_grad_1d, impl->d_collo_grad_1d, cgrad_length, e).wait_and_throw());
225     CeedCallBackend(CeedFree(&collo_grad_1d));
226   }
227 
228   // ---[Refactor into separate function]------>
229   // Define compile-time constants
230   std::map<std::string, CeedInt> jit_constants;
231   jit_constants["BASIS_DIM"]                 = dim;
232   jit_constants["BASIS_Q_1D"]                = Q_1d;
233   jit_constants["BASIS_P_1D"]                = P_1d;
234   jit_constants["T_1D"]                      = thread_1d;
235   jit_constants["BASIS_NUM_COMP"]            = num_comp;
236   jit_constants["BASIS_NUM_NODES"]           = num_nodes;
237   jit_constants["BASIS_NUM_QPTS"]            = num_qpts;
238   jit_constants["BASIS_HAS_COLLOCATED_GRAD"] = has_collocated_grad;
239   jit_constants["BASIS_INTERP_SCRATCH_SIZE"] = interp_group_size;
240   jit_constants["BASIS_GRAD_SCRATCH_SIZE"]   = grad_group_size;
241 
242   // Load kernel source
243   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor.h", &basis_kernel_path));
244   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
245   {
246     char *source;
247 
248     CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &source));
249     basis_kernel_source = source;
250   }
251   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete -----\n");
252 
253   // Compile kernels into a kernel bundle
254   CeedCallBackend(CeedBuildModule_Sycl(ceed, basis_kernel_source, &impl->sycl_module, jit_constants));
255 
256   // Load kernel functions
257   CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "Interp", &impl->interp_kernel));
258   CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "InterpTranspose", &impl->interp_transpose_kernel));
259   CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "Grad", &impl->grad_kernel));
260   CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "GradTranspose", &impl->grad_transpose_kernel));
261   CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "Weight", &impl->weight_kernel));
262 
263   // Clean-up
264   CeedCallBackend(CeedFree(&basis_kernel_path));
265   CeedCallBackend(CeedFree(&basis_kernel_source));
266   // <---[Refactor into separate function]------
267 
268   CeedCallBackend(CeedBasisSetData(basis, impl));
269 
270   // Register backend functions
271   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Sycl_shared));
272   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl_shared));
273   return CEED_ERROR_SUCCESS;
274 }
275 
276 //------------------------------------------------------------------------------
277