1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 #define CEED_DEBUG_COLOR 12
9
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/gen-tools.h>
13 #include <ceed/jit-tools.h>
14
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18
19 #include "../hip-ref/ceed-hip-ref.h"
20 #include "../hip-shared/ceed-hip-shared.h"
21 #include "../hip/ceed-hip-common.h"
22 #include "../hip/ceed-hip-compile.h"
23 #include "ceed-hip-gen.h"
24
// Describes an earlier operator field whose shared-memory basis matrices can
// be aliased instead of re-loaded for the current field
struct FieldReuse_Hip {
  CeedInt index;          // index of the field to reuse, or -1 when there is nothing to reuse
  bool is_input;          // true when the reused field is an input field, false for an output field
  CeedEvalMode eval_mode; // eval mode of the reused field (G matrices are only shared between CEED_EVAL_GRAD fields)
};
30
31 //------------------------------------------------------------------------------
32 // Calculate the block size used for launching the operator kernel
33 //------------------------------------------------------------------------------
BlockGridCalculate_Hip_gen(const CeedInt dim,const CeedInt num_elem,const CeedInt P_1d,const CeedInt Q_1d,CeedInt * block_sizes)34 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
35 const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
36 if (dim == 1) {
37 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
38
39 elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
40 block_sizes[0] = thread_1d;
41 block_sizes[1] = 1;
42 block_sizes[2] = elems_per_block;
43 } else if (dim == 2) {
44 const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
45
46 block_sizes[0] = thread_1d;
47 block_sizes[1] = thread_1d;
48 block_sizes[2] = elems_per_block;
49 } else if (dim == 3) {
50 const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
51
52 block_sizes[0] = thread_1d;
53 block_sizes[1] = thread_1d;
54 block_sizes[2] = elems_per_block;
55 }
56 return CEED_ERROR_SUCCESS;
57 }
58
59 //------------------------------------------------------------------------------
60 // Determine type of operator
61 //------------------------------------------------------------------------------
CeedOperatorBuildKernelData_Hip_gen(Ceed ceed,CeedInt num_input_fields,CeedOperatorField * op_input_fields,CeedQFunctionField * qf_input_fields,CeedInt num_output_fields,CeedOperatorField * op_output_fields,CeedQFunctionField * qf_output_fields,CeedInt * max_P,CeedInt * max_P_1d,CeedInt * Q,CeedInt * Q_1d,CeedInt * max_dim,bool * is_all_tensor,bool * use_3d_slices)62 static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
63 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
64 CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
65 CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
66 // Check if all are tensor
67 *is_all_tensor = true;
68 for (CeedInt i = 0; i < num_input_fields; i++) {
69 CeedBasis basis;
70
71 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
72 if (basis != CEED_BASIS_NONE) {
73 bool is_field_tensor;
74
75 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
76 *is_all_tensor = *is_all_tensor && is_field_tensor;
77 }
78 CeedCallBackend(CeedBasisDestroy(&basis));
79 }
80 for (CeedInt i = 0; i < num_output_fields; i++) {
81 CeedBasis basis;
82
83 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
84 if (basis != CEED_BASIS_NONE) {
85 bool is_field_tensor;
86
87 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
88 *is_all_tensor = *is_all_tensor && is_field_tensor;
89 }
90 CeedCallBackend(CeedBasisDestroy(&basis));
91 }
92
93 // Find max_P, max_P_1d, Q, and Q_1d
94 bool is_all_3d = true;
95
96 *max_P = 0;
97 *max_P_1d = 0;
98 *Q = 0;
99 *Q_1d = 0;
100 for (CeedInt i = 0; i < num_input_fields; i++) {
101 CeedBasis basis;
102
103 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
104 if (basis != CEED_BASIS_NONE) {
105 bool is_field_tensor;
106 CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
107
108 // Check if 3D
109 CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
110 is_all_3d = is_all_3d && (field_dim == 3);
111 *max_dim = CeedIntMax(*max_dim, field_dim);
112
113 // Collect P, P_1d, Q, and Q_1d
114 CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
115 *max_P = CeedIntMax(*max_P, field_P);
116 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
117 if (is_field_tensor) {
118 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
119 *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
120 }
121 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
122 CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
123 *Q = field_Q;
124 if (is_field_tensor) {
125 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
126 CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
127 *Q_1d = field_Q_1d;
128 }
129 }
130 CeedCallBackend(CeedBasisDestroy(&basis));
131 }
132 for (CeedInt i = 0; i < num_output_fields; i++) {
133 CeedBasis basis;
134
135 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
136 if (basis != CEED_BASIS_NONE) {
137 bool is_field_tensor;
138 CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
139
140 // Check if 3D
141 CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
142 is_all_3d = is_all_3d && (field_dim == 3);
143 *max_dim = CeedIntMax(*max_dim, field_dim);
144
145 // Collect P, P_1d, Q, and Q_1d
146 CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
147 *max_P = CeedIntMax(*max_P, field_P);
148 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
149 if (is_field_tensor) {
150 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
151 *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
152 }
153 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
154 CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
155 *Q = field_Q;
156 if (is_field_tensor) {
157 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
158 CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
159 *Q_1d = field_Q_1d;
160 }
161 }
162 CeedCallBackend(CeedBasisDestroy(&basis));
163 }
164
165 // Only use 3D collocated gradient parallelization strategy when gradient is computed
166 *use_3d_slices = false;
167 if (is_all_3d && *is_all_tensor) {
168 bool was_grad_found = false;
169
170 for (CeedInt i = 0; i < num_input_fields; i++) {
171 CeedEvalMode eval_mode;
172
173 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
174 if (eval_mode == CEED_EVAL_GRAD) {
175 CeedBasis_Hip_shared *basis_data;
176 CeedBasis basis;
177
178 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
179 CeedCallBackend(CeedBasisGetData(basis, &basis_data));
180 *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
181 was_grad_found = true;
182 CeedCallBackend(CeedBasisDestroy(&basis));
183 }
184 }
185 for (CeedInt i = 0; i < num_output_fields; i++) {
186 CeedEvalMode eval_mode;
187
188 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
189 if (eval_mode == CEED_EVAL_GRAD) {
190 CeedBasis_Hip_shared *basis_data;
191 CeedBasis basis;
192
193 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
194 CeedCallBackend(CeedBasisGetData(basis, &basis_data));
195 *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
196 was_grad_found = true;
197 CeedCallBackend(CeedBasisDestroy(&basis));
198 }
199 }
200 }
201 return CEED_ERROR_SUCCESS;
202 }
203
204 //------------------------------------------------------------------------------
205 // Setup fields
206 //------------------------------------------------------------------------------
// Emit the per-field constant declarations and shared-memory loads of the
// basis matrices (interp "B" and gradient "G") for one operator field, and
// record the matching device pointers in `data` for kernel launch.
//
//   code             - generated kernel source is appended here
//   data             - backend data; B/G device-pointer tables are filled in
//   tab              - current indentation level for generated source
//   i                - index of the field in the input or output field list
//   field_reuse      - earlier field whose shared-memory matrices may be
//                      aliased instead of re-loaded (index == -1 means none)
//   skip_active_load - when true, emit NULL matrix pointers for active fields
//                      instead of loading them into shared memory
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices, bool skip_active_load) {
  bool is_tensor = true, is_active = true;
  CeedBasis basis;

  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
  {
    // A field is "active" when its vector is the operator's active vector
    CeedVector vec;

    CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
    is_active = vec == CEED_VECTOR_ACTIVE;
    CeedCallBackend(CeedVectorDestroy(&vec));
  }

  const char *field_name;
  // Generated symbols are suffixed "_in_<i>" / "_out_<i>"
  std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode eval_mode = CEED_EVAL_NONE;
  CeedInt elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction elem_rstr;
  // NOTE(review): basis_data is only assigned when basis != CEED_BASIS_NONE;
  // the INTERP/GRAD/WEIGHT branches below rely on it — presumably those eval
  // modes always come with a real basis; confirm against callers
  CeedBasis_Hip_shared *basis_data;

  // Field reuse info
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    // For non-tensor bases P_1d holds the total node count
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Set field constants
  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
  if (is_tensor && !is_all_tensor) {
    // Mixed tensor/non-tensor operator: also emit the total node count P
    // NOTE(review): CeedBasisGetNumNodes is called before the NONE check in the
    // ternary below — confirm basis cannot be CEED_BASIS_NONE on this path
    CeedInt P = 0;

    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
  }
  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
  if (eval_mode != CEED_EVAL_WEIGHT) {
    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
  }

  // Load basis data
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  switch (eval_mode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      if (is_at_points) {
        // AtPoints: lazily build and upload the 1D Chebyshev interpolation
        // matrix; it is cached on the basis data for later fields/operators
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (use_previous_field && !skip_active_load) {
        // Alias the shared-memory B matrix already loaded for an earlier field
        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
      } else {
        bool is_collocated = false;

        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
        if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
          // No matrix needed: active load skipped, or nodes collocated with quadrature points
          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
        }
      }
      break;
    case CEED_EVAL_GRAD:
      if (is_at_points) {
        // AtPoints: same lazy Chebyshev interpolation upload as CEED_EVAL_INTERP
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (is_tensor) {
        // Gradient of a tensor basis also needs the interp matrix B
        if (use_previous_field && !skip_active_load) {
          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          bool is_collocated = false;

          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
          if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
          }
        }
      }
      if (is_at_points) break; // No G mat for AtPoints
      if (use_3d_slices) {
        // 3D slice strategy uses the Q x Q collocated gradient matrix
        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
        // G is only shared with an earlier field that is also CEED_EVAL_GRAD
        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
        } else if (is_active && skip_active_load) {
          code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
        }
      } else {
        // Prefer the collocated gradient matrix when the basis provides one
        bool has_collo_grad = basis_data->d_collo_grad_1d;

        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        if (has_collo_grad) {
          // Collocated gradient: G is Q x Q
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        } else {
          // Standard gradient: G is P x Q (times dim for non-tensor bases)
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                 << (is_tensor ? "" : var_suffix) << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        }
      }
      break;
    case CEED_EVAL_WEIGHT:
      break; // No action
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
      // LCOV_EXCL_STOP
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}
406
407 //------------------------------------------------------------------------------
408 // Restriction
409 //------------------------------------------------------------------------------
// Emit the element-restriction stage for one operator field: reading the
// element values from the L-vector into registers for inputs, or writing the
// element contribution back to the L-vector for outputs. Device offset arrays
// for standard/points restrictions are recorded in `data->indices`.
//
//   field_input_buffer - for inputs, field_input_buffer[i] names the field
//                        whose restricted data field i shares; when it differs
//                        from i the restriction was already emitted there
//   Q_1d               - currently unused in this function
static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
                                                      CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                      bool use_3d_slices) {
  std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string P_name = (is_all_tensor ? "P_1d" : "P") + var_suffix;
  CeedEvalMode eval_mode = CEED_EVAL_NONE;
  CeedInt elem_size = 0, num_comp = 0;
  CeedSize l_size;
  CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD;
  // NOTE(review): rstr_data is only assigned when a restriction exists; the
  // branches below that dereference it assume non-WEIGHT fields always carry
  // a restriction — confirm against callers
  CeedElemRestriction_Hip *rstr_data;
  CeedElemRestriction elem_rstr;

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for previous input
      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      // WEIGHT fields read no L-vector; NONE + 3D slices + AtPoints skips the buffer entirely
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          // Indexed read through the offsets array
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
               << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          // Strided read; backend strides default to {1, elem_size*num_elem, elem_size}
          bool has_backend_strides;
          CeedInt num_elem;

          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
               << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          // AtPoints: only record offsets; no element buffer read here
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break; // TODO: Not implemented
          // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output: mirror of the input read paths, writing r_e_* back to the L-vector
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        CeedInt comp_stride;

        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        bool has_backend_strides;
        CeedInt num_elem;

        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        // AtPoints: only record offsets for the generated kernel
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break; // TODO: Not implemented
        // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}
543
544 //------------------------------------------------------------------------------
545 // Basis
546 //------------------------------------------------------------------------------
CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream & code,CeedOperator_Hip_gen * data,Tab & tab,CeedInt i,CeedOperatorField op_field,CeedQFunctionField qf_field,CeedInt max_dim,CeedInt Q_1d,bool is_input,bool is_all_tensor,bool is_at_points,bool use_3d_slices)547 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
548 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
549 bool is_at_points, bool use_3d_slices) {
550 bool is_tensor = true, is_collocated = true;
551 CeedBasis basis;
552 CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
553 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
554 CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
555
556 std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
557 std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
558 CeedEvalMode eval_mode = CEED_EVAL_NONE;
559 CeedInt dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
560 CeedElemRestriction elem_rstr;
561
562 // Get field data
563 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
564 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
565 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
566 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
567 }
568 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
569 if (basis != CEED_BASIS_NONE) {
570 CeedCallBackend(CeedBasisGetDimension(basis, &dim));
571 if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
572 else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
573 }
574 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
575
576 // Basis
577 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
578 if (is_input) {
579 switch (eval_mode) {
580 case CEED_EVAL_NONE:
581 if (!use_3d_slices && !is_at_points) {
582 code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
583 }
584 break;
585 case CEED_EVAL_INTERP:
586 if (is_at_points) {
587 std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
588
589 code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
590 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
591 << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
592 } else {
593 std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
594 std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
595 : "InterpNonTensor";
596 std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
597
598 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
599 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
600 << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
601 }
602 break;
603 case CEED_EVAL_GRAD:
604 if (is_at_points) {
605 std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
606
607 code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
608 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
609 << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
610 } else if (use_3d_slices) {
611 std::string function_name =
612 (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";
613
614 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
615 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
616 << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
617 } else if (is_tensor) {
618 bool is_collocated_grad = dim == 3 && Q_1d >= P_1d;
619 std::string function_name =
620 (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
621 std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
622 std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
623
624 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
625 << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
626 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
627 << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
628 } else {
629 std::string function_name = "GradNonTensor";
630
631 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
632 code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
633 << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
634 }
635 break;
636 case CEED_EVAL_WEIGHT: {
637 if (is_at_points) {
638 code << tab << "// Nothing to do AtPoints\n";
639 } else {
640 CeedBasis_Hip_shared *basis_data;
641 std::string function_name = is_tensor
642 ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
643 : "WeightNonTensor";
644
645 code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
646 CeedCallBackend(CeedBasisGetData(basis, &basis_data));
647 data->W = basis_data->d_q_weight_1d;
648 code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
649 }
650 break;
651 }
652 // LCOV_EXCL_START
653 case CEED_EVAL_DIV:
654 case CEED_EVAL_CURL:
655 break; // TODO: Not implemented
656 // LCOV_EXCL_STOP
657 }
658 } else {
659 switch (eval_mode) {
660 case CEED_EVAL_NONE:
661 code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
662 break; // No action
663 case CEED_EVAL_INTERP:
664 code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
665 if (is_at_points) {
666 std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
667
668 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
669 << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
670 } else {
671 std::string function_name =
672 is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
673 std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
674 : "InterpTransposeNonTensor";
675 std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
676
677 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
678 << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
679 }
680 break;
681 case CEED_EVAL_GRAD:
682 code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
683 if (is_at_points) {
684 std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
685
686 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
687 << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
688 } else if (use_3d_slices) {
689 std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
690 std::to_string(dim) + "d";
691
692 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
693 << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
694 } else if (is_tensor) {
695 bool is_collocated_grad = dim == 3 && Q_1d >= P_1d;
696 std::string function_name =
697 (dim == 1 ? "GradTranspose"
698 : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
699 std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
700 std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
701
702 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
703 << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
704 } else {
705 std::string function_name = "GradTransposeNonTensor";
706
707 code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
708 << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
709 }
710 break;
711 // LCOV_EXCL_START
712 case CEED_EVAL_WEIGHT:
713 break; // Should not occur
714 case CEED_EVAL_DIV:
715 case CEED_EVAL_CURL:
716 break; // TODO: Not implemented
717 // LCOV_EXCL_STOP
718 }
719 }
720 CeedCallBackend(CeedBasisDestroy(&basis));
721 return CEED_ERROR_SUCCESS;
722 }
723
724 //------------------------------------------------------------------------------
725 // QFunction
726 //------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Emit the QFunction stage of the generated operator kernel: set up output
// registers, stage input/output field data into per-point (or per-quadrature-
// slice) "r_s" registers, call the user QFunction, and map results back into
// the per-element accumulators.
//
// Three mutually exclusive execution modes drive the generated code:
//   - is_at_points:   loop over batches of arbitrary points per element;
//                     inputs are interpolated from Chebyshev coefficient
//                     registers (r_c) and outputs accumulated back into them
//   - use_3d_slices:  loop over 1D planes (q slices) of a 3D tensor element
//                     to reduce register pressure
//   - otherwise:      whole-element processing; r_s registers alias the r_q
//                     quadrature registers directly
//
// Parameters:
//   code               - stream receiving the generated HIP kernel source
//   data               - backend operator data; NOTE this function also
//                        records device pointers into data->indices.inputs[]
//                        (3D-slice standard restrictions) as a side effect,
//                        since the generated code reads them at launch time
//   tab                - indentation helper for the emitted source
//   max_dim            - maximum basis dimension over all fields
//   max_num_points     - max points per element (AtPoints mode only)
//   num_input_fields, op_input_fields, qf_input_fields    - operator inputs
//   num_output_fields, op_output_fields, qf_output_fields - operator outputs
//   qfunction_name     - name of the user QFunction symbol to call
//   Q_1d               - 1D quadrature size (sizes appear symbolically via
//                        Q_name in the generated source, not numerically here)
//   is_all_tensor      - true if every active basis is tensor-product
//   is_at_points       - true for AtPoints operators
//   use_3d_slices      - true when processing 3D elements plane-by-plane
//   is_assemble        - true when building an assembly kernel; skips the
//                        CEED_EVAL_NONE AtPoints output write-back
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
                                                    CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                    CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                    CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                    std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices, bool is_assemble) {
  // In mixed tensor/non-tensor operators the flattened (non-tensor) quadrature
  // count "Q" is used; pure tensor operators use the 1D size "Q_1d"
  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
  CeedElemRestriction elem_rstr;

  // Setup output arrays
  // Declare the quadrature-space (r_q) or coefficient (r_c) registers that the
  // later stages write into. In 3D tensor mode each thread holds a column of
  // Q_name values, hence the "*Q_name" sizing; otherwise one value per comp.
  code << "\n";
  code << tab << "// -- Output field setup\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;
    std::string var_suffix = "_out_" + std::to_string(i);

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        if (is_at_points) {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_INTERP:
        if (is_at_points) {
          // Accumulator for point data
          // Chebyshev coefficient register, zero-initialized so per-point
          // transpose interpolation can accumulate into it
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_GRAD:
        if (is_at_points) {
          // Accumulator for point data
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else if (use_3d_slices) {
          // Accumulator for gradient slices
          // With 3D slices the gradient is applied in collocated form per
          // plane, so only num_comp*Q values (not num_comp*dim*Q) are needed
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
        }
        break;
      case CEED_EVAL_WEIGHT:
        break;
      // LCOV_EXCL_START
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
      // LCOV_EXCL_STOP
    }
  }

  if (is_at_points) {
    // We need to handle batches of points
    // The loop bound is rounded up to a multiple of the thread block so every
    // thread participates in the collective basis operations; excess threads
    // wrap via "p = i % max_num_points" and are masked at write-back
    code << "\n";
    code << tab << "// Note: Using batches of points\n";
    code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
    tab.push();
    code << tab << "const CeedInt p = i % max_num_points;\n\n";

    code << tab << "// -- Coordinates\n";
    code << tab << "CeedScalar r_x[max_dim];\n";
    code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";

    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      // NONE reads raw L-vector values at the point; INTERP/GRAD evaluate the
      // Chebyshev coefficient registers (r_c, filled earlier) at coords r_x
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          // AtPoints has no quadrature weights; emit unit weight
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
        // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    code << tab << "// -- Output fields\n";
    // Only declare the per-point output slice registers here; write-back to
    // the r_c/d accumulators happens after the QFunction call below
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
        // LCOV_EXCL_STOP
      }
    }

  } else if (use_3d_slices) {
    // We treat quadrature points per slice in 3d to save registers
    code << "\n";
    code << tab << "// Note: Using planes of 3D elements\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // CEED_EVAL_NONE fields are read slice-by-slice straight from the
          // L-vector; pick the strided vs. offset-indexed reader based on the
          // restriction type and bake strides/sizes into the generated source
          bool is_strided;

          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
          if (is_strided) {
            bool    has_backend_strides;
            CeedInt num_elem, elem_size;

            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
            // Default backend strides; overwritten below if user strides exist
            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

            if (!has_backend_strides) {
              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
            }
            code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
                 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
            code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
                 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
          } else {
            CeedSize                 l_size = 0;
            CeedInt                  comp_stride;
            CeedElemRestriction_Hip *rstr_data;

            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
            code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
            code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
            // Side effect: stash the device offsets pointer so the launch can
            // pass it as indices.inputs[i] to the generated kernel
            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
            code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
                 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          }
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          break;
        case CEED_EVAL_INTERP:
          // Copy this plane's values out of the full quadrature register
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
          tab.push();
          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          // Collocated gradient on the current plane
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
               << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
        // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    code << tab << "// -- Output fields\n";
    // Declare per-slice output registers; results are copied back into the
    // full r_q accumulators after the QFunction call below
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
        // LCOV_EXCL_STOP
      }
    }
  } else {
    // Whole-element mode: no staging needed, the slice pointers simply alias
    // the quadrature registers (scope block keeps pointer names local)
    code << "\n";
    code << tab << "// Note: Using full elements\n";
    code << tab << "{\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
    }
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
    }
  }

  // Input and output buffers
  // Pointer arrays passed to the user QFunction; sized at least 1 so the
  // generated source stays valid when a side has zero fields
  code << "\n";
  code << tab << "// -- QFunction inputs and outputs\n";
  code << tab << "// ---- Inputs\n";
  code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
    code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
    code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
  }
  code << tab << "// ---- Outputs\n";
  code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
    code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
  }

  // Apply QFunction
  // Q argument is 1 except in the full-element 3D tensor case, where each
  // thread processes a whole column of Q_1d quadrature points per call
  code << "\n";
  code << tab << "// -- Apply QFunction\n";
  code << tab << "" << qfunction_name << "(ctx, ";
  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
    code << "1";
  } else {
    code << Q_name;
  }
  code << ", inputs, outputs);\n";

  if (is_at_points) {
    // Map back to coefficients
    // Point-loop padding threads (i >= num_per_elem) contribute zeros so the
    // collective transpose operations stay correct
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE: {
          CeedInt             comp_stride;
          CeedElemRestriction elem_rstr;

          // Assembly kernels skip the direct L-vector write-back
          if (is_assemble) break;
          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
          break;
        }
        case CEED_EVAL_INTERP:
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
        // LCOV_EXCL_STOP
      }
    }
  } else if (use_3d_slices) {
    // Copy or apply transpose grad, if needed
    // Scatter this plane's QFunction results back into the full r_q registers
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          // Transpose collocated gradient accumulates into r_q
          code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
               << var_suffix << ", r_q" << var_suffix << ");\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
        // LCOV_EXCL_STOP
      }
    }
  }
  // Close whichever scope was opened above: the point loop (AtPoints), the
  // slice loop (3D slices), or the plain block (full elements)
  tab.pop();
  code << tab << "}\n";
  return CEED_ERROR_SUCCESS;
}
1143
1144 //------------------------------------------------------------------------------
1145 // Build single operator kernel
1146 //------------------------------------------------------------------------------
CeedOperatorBuildKernel_Hip_gen(CeedOperator op,bool * is_good_build)1147 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
1148 bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
1149 Ceed ceed;
1150 CeedInt Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1151 CeedQFunctionField *qf_input_fields, *qf_output_fields;
1152 CeedQFunction_Hip_gen *qf_data;
1153 CeedQFunction qf;
1154 CeedOperatorField *op_input_fields, *op_output_fields;
1155 CeedOperator_Hip_gen *data;
1156 std::ostringstream code;
1157 Tab tab;
1158
1159 CeedCallBackend(CeedOperatorGetData(op, &data));
1160 {
1161 bool is_setup_done;
1162
1163 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1164 if (is_setup_done) {
1165 *is_good_build = !data->use_fallback;
1166 return CEED_ERROR_SUCCESS;
1167 }
1168 }
1169
1170 // Check field compatibility
1171 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1172 {
1173 bool has_shared_bases = true;
1174
1175 for (CeedInt i = 0; i < num_input_fields; i++) {
1176 CeedBasis basis;
1177
1178 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1179 if (basis != CEED_BASIS_NONE) {
1180 bool is_tensor = true;
1181 const char *resource;
1182 char *resource_root;
1183 Ceed basis_ceed;
1184
1185 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1186 is_all_tensor = is_all_tensor && is_tensor;
1187 is_all_nontensor = is_all_nontensor && !is_tensor;
1188 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1189 CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1190 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1191 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1192 CeedCallBackend(CeedFree(&resource_root));
1193 CeedCallBackend(CeedDestroy(&basis_ceed));
1194 }
1195 CeedCallBackend(CeedBasisDestroy(&basis));
1196 }
1197
1198 for (CeedInt i = 0; i < num_output_fields; i++) {
1199 CeedBasis basis;
1200
1201 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1202 if (basis != CEED_BASIS_NONE) {
1203 bool is_tensor = true;
1204 const char *resource;
1205 char *resource_root;
1206 Ceed basis_ceed;
1207
1208 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1209 is_all_tensor = is_all_tensor && is_tensor;
1210 is_all_nontensor = is_all_nontensor && !is_tensor;
1211
1212 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1213 CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1214 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1215 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1216 CeedCallBackend(CeedFree(&resource_root));
1217 CeedCallBackend(CeedDestroy(&basis_ceed));
1218 }
1219 CeedCallBackend(CeedBasisDestroy(&basis));
1220 }
1221 // -- Fallback to ref if not all bases are shared
1222 if (!has_shared_bases) {
1223 *is_good_build = false;
1224 return CEED_ERROR_SUCCESS;
1225 }
1226 }
1227 CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1228 CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1229 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1230 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1231
1232 // Get operator data
1233 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1234 {
1235 CeedInt max_P = 0, max_P_1d = 0;
1236
1237 CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
1238 qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
1239 data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
1240 }
1241 if (is_at_points) {
1242 CeedInt coords_dim = 0;
1243 CeedElemRestriction_Hip *rstr_data;
1244 CeedElemRestriction rstr_points = NULL;
1245
1246 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1247 CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1248 CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1249 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim));
1250 CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1251 data->points.indices = (CeedInt *)rstr_data->d_offsets;
1252 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1253 if (max_dim == 0) max_dim = coords_dim;
1254 if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim));
1255 }
1256 if (max_dim == 0) max_dim = 1;
1257 data->dim = max_dim;
1258 if (is_at_points) use_3d_slices = false;
1259 if (Q_1d == 0) {
1260 if (is_at_points) Q_1d = max_num_points;
1261 else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1262 }
1263 if (Q == 0) Q = Q_1d;
1264 data->Q = Q;
1265 data->Q_1d = Q_1d;
1266
1267 // Check for restriction only identity operator
1268 {
1269 bool is_identity_qf;
1270
1271 CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1272 if (is_identity_qf) {
1273 CeedEvalMode eval_mode_in, eval_mode_out;
1274
1275 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1276 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1277 CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1278 "Backend does not implement restriction only identity operators");
1279 }
1280 }
1281
1282 // Load basis source files
1283 if (!is_all_nontensor) {
1284 code << tab << "// Tensor basis source\n";
1285 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1286 }
1287 if (!is_all_tensor) {
1288 code << tab << "// Non-tensor basis source\n";
1289 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
1290 }
1291 if (is_at_points) {
1292 code << tab << "// AtPoints basis source\n";
1293 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1294 }
1295 if (!is_all_tensor && !is_all_nontensor) {
1296 code << tab << "// Tensor basis source\n";
1297 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
1298 }
1299 code << tab << "// CodeGen operator source\n";
1300 code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1301
1302 // Get QFunction name
1303 std::string qfunction_name(qf_data->qfunction_name);
1304 std::string operator_name;
1305
1306 operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
1307
1308 // Define CEED_Q_VLA
1309 code << "\n" << tab << "#undef CEED_Q_VLA\n";
1310 if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1311 code << tab << "#define CEED_Q_VLA 1\n\n";
1312 } else {
1313 code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1314 }
1315
1316 // Add user QFunction source
1317 {
1318 const char *source_path;
1319
1320 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1321 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1322
1323 code << tab << "// User QFunction source\n";
1324 code << tab << "#include \"" << source_path << "\"\n\n";
1325 }
1326
1327 // Setup
1328 code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1329 code << tab << "// Operator Kernel\n";
1330 code << tab << "// \n";
1331 code << tab << "// d_[in,out]_i: CeedVector device array\n";
1332 code << tab << "// r_[in,out]_e_i: Element vector register\n";
1333 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1334 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1335 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1336 code << tab << "// \n";
1337 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1338 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1339 code << tab << "// -----------------------------------------------------------------------------\n";
1340 code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
1341 code << "__global__ void " << operator_name
1342 << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
1343 tab.push();
1344
1345 // Scratch buffers
1346 for (CeedInt i = 0; i < num_input_fields; i++) {
1347 CeedEvalMode eval_mode;
1348
1349 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1350 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
1351 code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1352 }
1353 }
1354 for (CeedInt i = 0; i < num_output_fields; i++) {
1355 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1356 }
1357
1358 code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1359 if (!is_all_tensor) {
1360 code << tab << "const CeedInt Q = " << Q << ";\n";
1361 }
1362 if (!is_all_nontensor) {
1363 code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1364 }
1365 if (is_at_points) {
1366 code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1367 code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1368 }
1369
1370 // Shared data
1371 code << tab << "extern __shared__ CeedScalar slice[];\n";
1372 code << tab << "SharedData_Hip data;\n";
1373 code << tab << "data.t_id_x = threadIdx.x;\n";
1374 code << tab << "data.t_id_y = threadIdx.y;\n";
1375 code << tab << "data.t_id_z = threadIdx.z;\n";
1376 code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1377 code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1378
1379 // -- Determine input mat reuse
1380 FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1381
1382 for (CeedInt i = 0; i < num_input_fields; i++) {
1383 input_matrix_reuse[i].index = -1;
1384 }
1385 for (CeedInt i = 0; i < num_input_fields; i++) {
1386 bool is_tensor = true;
1387 CeedEvalMode eval_mode_i;
1388 CeedBasis basis_i;
1389
1390 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1391 if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1392 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1393 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1394 for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1395 CeedEvalMode eval_mode_j;
1396 CeedBasis basis_j;
1397
1398 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1399 if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1400 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1401 if (basis_i == basis_j) {
1402 if (is_tensor) {
1403 input_matrix_reuse[i].index = j;
1404 input_matrix_reuse[i].is_input = true;
1405 input_matrix_reuse[i].eval_mode = eval_mode_j;
1406 } else {
1407 // For non-tensor can only re-use with the same eval mode
1408 if (eval_mode_i == eval_mode_j) {
1409 input_matrix_reuse[i].index = j;
1410 input_matrix_reuse[i].is_input = true;
1411 input_matrix_reuse[i].eval_mode = eval_mode_j;
1412 }
1413 }
1414 }
1415 CeedCallBackend(CeedBasisDestroy(&basis_j));
1416 }
1417 CeedCallBackend(CeedBasisDestroy(&basis_i));
1418 }
1419
1420 // -- Determine output mat reuse
1421 FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1422
1423 for (CeedInt i = 0; i < num_output_fields; i++) {
1424 output_matrix_reuse[i].index = -1;
1425 }
1426 for (CeedInt i = 0; i < num_output_fields; i++) {
1427 bool is_tensor = true;
1428 CeedEvalMode eval_mode_i;
1429 CeedBasis basis_i;
1430
1431 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1432 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1433 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1434 CeedEvalMode eval_mode_j;
1435 CeedBasis basis_j;
1436
1437 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1438 if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1439 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1440 if (basis_i == basis_j) {
1441 if (is_tensor) {
1442 output_matrix_reuse[i].index = j;
1443 output_matrix_reuse[i].is_input = true;
1444 output_matrix_reuse[i].eval_mode = eval_mode_j;
1445 } else {
1446 // For non-tensor can only re-use with the same eval mode
1447 if (eval_mode_i == eval_mode_j) {
1448 output_matrix_reuse[i].index = j;
1449 output_matrix_reuse[i].is_input = true;
1450 output_matrix_reuse[i].eval_mode = eval_mode_j;
1451 }
1452 }
1453 }
1454 CeedCallBackend(CeedBasisDestroy(&basis_j));
1455 }
1456 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1457 CeedEvalMode eval_mode_j;
1458 CeedBasis basis_j;
1459
1460 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1461 if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1462 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1463 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1464 if (basis_i == basis_j) {
1465 if (is_tensor) {
1466 output_matrix_reuse[i].index = j;
1467 output_matrix_reuse[i].is_input = false;
1468 output_matrix_reuse[i].eval_mode = eval_mode_j;
1469 } else {
1470 // For non-tensor can only re-use with the same eval mode
1471 if (eval_mode_i == eval_mode_j) {
1472 output_matrix_reuse[i].index = j;
1473 output_matrix_reuse[i].is_input = false;
1474 output_matrix_reuse[i].eval_mode = eval_mode_j;
1475 }
1476 }
1477 }
1478 CeedCallBackend(CeedBasisDestroy(&basis_j));
1479 }
1480 CeedCallBackend(CeedBasisDestroy(&basis_i));
1481 }
1482
1483 // Initialize constants, and matrices B and G
1484 code << "\n" << tab << "// Input field constants and basis data\n";
1485 for (CeedInt i = 0; i < num_input_fields; i++) {
1486 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1487 max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
1488 }
1489 code << "\n" << tab << "// Output field constants and basis data\n";
1490 for (CeedInt i = 0; i < num_output_fields; i++) {
1491 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1492 max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
1493 }
1494
1495 // Loop over all elements
1496 code << "\n" << tab << "// Element loop\n";
1497 code << tab << "__syncthreads();\n";
1498 code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1499 tab.push();
1500
1501 // -- Compute minimum buffer space needed
1502 CeedInt max_rstr_buffer_size = 1;
1503
1504 for (CeedInt i = 0; i < num_input_fields; i++) {
1505 CeedEvalMode eval_mode;
1506
1507 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1508 if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1509 CeedInt num_comp;
1510 CeedElemRestriction elem_rstr;
1511
1512 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1513 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1514 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1515 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1516 }
1517 }
1518 for (CeedInt i = 0; i < num_output_fields; i++) {
1519 CeedEvalMode eval_mode;
1520
1521 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1522 if (eval_mode != CEED_EVAL_NONE) {
1523 CeedInt num_comp;
1524 CeedElemRestriction elem_rstr;
1525
1526 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1527 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1528 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1529 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1530 }
1531 }
1532 code << tab << "// Scratch restriction buffer space\n";
1533 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1534
1535 // -- Determine best input field processing order
1536 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1537
1538 for (CeedInt i = 0; i < num_input_fields; i++) {
1539 field_rstr_in_buffer[i] = -1;
1540 input_field_order[i] = -1;
1541 }
1542 {
1543 bool is_ordered[CEED_FIELD_MAX];
1544 CeedInt curr_index = 0;
1545
1546 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1547 for (CeedInt i = 0; i < num_input_fields; i++) {
1548 CeedVector vec_i;
1549 CeedElemRestriction rstr_i;
1550
1551 if (is_ordered[i]) continue;
1552 field_rstr_in_buffer[i] = i;
1553 is_ordered[i] = true;
1554 input_field_order[curr_index] = i;
1555 curr_index++;
1556 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1557 if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT
1558 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1559 for (CeedInt j = i + 1; j < num_input_fields; j++) {
1560 CeedVector vec_j;
1561 CeedElemRestriction rstr_j;
1562
1563 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1564 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1565 if (rstr_i == rstr_j && vec_i == vec_j) {
1566 field_rstr_in_buffer[j] = i;
1567 is_ordered[j] = true;
1568 input_field_order[curr_index] = j;
1569 curr_index++;
1570 }
1571 CeedCallBackend(CeedVectorDestroy(&vec_j));
1572 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1573 }
1574 CeedCallBackend(CeedVectorDestroy(&vec_i));
1575 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1576 }
1577 }
1578
1579 // -- Input restriction and basis
1580 code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1581 for (CeedInt i = 0; i < num_input_fields; i++) {
1582 const char *field_name;
1583 const CeedInt f = input_field_order[i];
1584
1585 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1586 code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1587
1588 // ---- Restriction
1589 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1590 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1591
1592 // ---- Basis action
1593 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1594 is_all_tensor, is_at_points, use_3d_slices));
1595 }
1596
1597 // -- Q function
1598 CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1599 qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1600 Q_1d, is_all_tensor, is_at_points, use_3d_slices, false));
1601
1602 // -- Output basis and restriction
1603 code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1604 for (CeedInt i = 0; i < num_output_fields; i++) {
1605 const char *field_name;
1606
1607 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1608 code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1609
1610 // ---- Basis action
1611 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1612 is_all_tensor, is_at_points, use_3d_slices));
1613
1614 // ---- Restriction
1615 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
1616 false, is_all_tensor, is_at_points, use_3d_slices));
1617 }
1618
1619 // Close loop and function
1620 tab.pop();
1621 code << tab << "}\n";
1622 tab.pop();
1623 code << tab << "}\n";
1624 code << tab << "// -----------------------------------------------------------------------------\n\n";
1625
1626 CeedInt block_sizes[3] = {0, 0, 0};
1627 CeedInt num_elem;
1628
1629 // Compile
1630 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
1631 CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
1632 {
1633 bool is_compile_good = false;
1634
1635 data->thread_1d = block_sizes[0];
1636 CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
1637 block_sizes[0] * block_sizes[1] * block_sizes[2]));
1638 if (is_compile_good) {
1639 *is_good_build = true;
1640 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
1641 } else {
1642 *is_good_build = false;
1643 data->use_fallback = true;
1644 }
1645 }
1646 CeedCallBackend(CeedOperatorSetSetupDone(op));
1647 CeedCallBackend(CeedDestroy(&ceed));
1648 CeedCallBackend(CeedQFunctionDestroy(&qf));
1649 return CEED_ERROR_SUCCESS;
1650 }
1651
1652 //------------------------------------------------------------------------------
1653 // Build AtPoints assembly operator kernel
1654 //------------------------------------------------------------------------------
/**
 * @brief Build and JiT-compile the HIP kernel that assembles an AtPoints operator.
 *
 * Emits device source that assembles either the full operator matrix or only its
 * diagonal by applying the operator action to unit e-vectors: for each node and
 * component of the (first) active input field, the corresponding register entry is
 * set to 1.0, the operator is applied, the result is written into `values_array`,
 * and the entry is reset to 0.0. The generated source is then compiled, and the
 * kernel handle is stored on the operator backend data.
 *
 * @param[in]  op            CeedOperator to build the assembly kernel for
 * @param[in]  is_full       true to assemble the full matrix, false for the diagonal only
 * @param[out] is_good_build Set to true on successful compilation; false means the
 *                           caller should use the assembly fallback path
 *
 * @return An error code: 0 - success, otherwise - failure
 */
static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
  // Note: is_all_tensor stays true for the whole build — this path only handles
  //       tensor-product bases (AtPoints); use_3d_slices is likewise never enabled here
  bool                  is_all_tensor = true, is_at_points = false, use_3d_slices = false;
  Ceed                  ceed;
  CeedInt               Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
  CeedQFunctionField   *qf_input_fields, *qf_output_fields;
  CeedQFunction_Hip_gen *qf_data;
  CeedQFunction         qf;
  CeedOperatorField    *op_input_fields, *op_output_fields;
  CeedOperator_Hip_gen *data;
  std::ostringstream    code;
  Tab                   tab;

  // Check compatibility
  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
  CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");

  // Retrieve operator data
  CeedCallBackend(CeedOperatorGetData(op, &data));
  Q      = data->Q;
  Q_1d   = data->Q_1d;
  max_dim = data->dim;
  {
    CeedElemRestriction rstr_points = NULL;

    // Point coordinates restriction determines the per-element point bound and
    // the component stride used to index coordinate data in the kernel
    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
  }
  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));

  // Load basis source files
  code << tab << "// Tensor basis source\n";
  code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
  code << tab << "// AtPoints basis source\n";
  code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
  code << tab << "// CodeGen operator source\n";
  code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";

  // Get QFunction name
  std::string qfunction_name(qf_data->qfunction_name);
  std::string operator_name;

  if (is_full) {
    operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
  } else {
    operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
  }

  // Define CEED_Q_VLA
  code << "\n" << tab << "#undef CEED_Q_VLA\n";
  code << tab << "#define CEED_Q_VLA 1\n\n";

  // Add user QFunction source
  {
    const char *source_path;

    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");

    code << tab << "// User QFunction source\n";
    code << tab << "#include \"" << source_path << "\"\n\n";
  }

  // Setup
  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
  code << tab << "// Operator Assembly Kernel\n";
  code << tab << "// \n";
  code << tab << "// d_[in,out]_i: CeedVector device array\n";
  code << tab << "// r_[in,out]_e_i: Element vector register\n";
  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
  code << tab << "// \n";
  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
  code << tab << "// -----------------------------------------------------------------------------\n";
  // Same argument list as the apply kernel plus values_array, the assembly output buffer
  code << tab << "extern \"C\" __global__ void " << operator_name
       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
          "points, CeedScalar *__restrict__ values_array) {\n";
  tab.push();

  // Scratch buffers
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedEvalMode eval_mode;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
    }
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
  }

  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
  code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
  code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
  code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";

  // Shared data
  code << tab << "extern __shared__ CeedScalar slice[];\n";
  code << tab << "SharedData_Hip data;\n";
  code << tab << "data.t_id_x = threadIdx.x;\n";
  code << tab << "data.t_id_y = threadIdx.y;\n";
  code << tab << "data.t_id_z = threadIdx.z;\n";
  code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
  // Each z-layer of the thread block (one element) gets its own slice of shared memory;
  // 1D needs OP_T_1D scalars per layer, higher dims OP_T_1D^2
  code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";

  // -- Determine input mat reuse
  // input_matrix_reuse[i].index == -1 means field i loads its own B/G matrices;
  // otherwise it aliases the shared-memory matrices of an earlier input field with the same basis
  FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];

  for (CeedInt i = 0; i < num_input_fields; i++) {
    input_matrix_reuse[i].index = -1;
  }
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedEvalMode eval_mode_i;
    CeedBasis    basis_i;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
      CeedEvalMode eval_mode_j;
      CeedBasis    basis_j;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
      if (basis_i == basis_j) {
        input_matrix_reuse[i].index     = j;
        input_matrix_reuse[i].is_input  = true;
        input_matrix_reuse[i].eval_mode = eval_mode_j;
      }
      CeedCallBackend(CeedBasisDestroy(&basis_j));
    }
    CeedCallBackend(CeedBasisDestroy(&basis_i));
  }

  // -- Determine output mat reuse
  // Output fields first try to reuse an input field's matrices, then an earlier output field's
  FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];

  for (CeedInt i = 0; i < num_output_fields; i++) {
    output_matrix_reuse[i].index = -1;
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedEvalMode eval_mode_i;
    CeedBasis    basis_i;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
      CeedEvalMode eval_mode_j;
      CeedBasis    basis_j;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
      if (basis_i == basis_j) {
        output_matrix_reuse[i].index     = j;
        output_matrix_reuse[i].is_input  = true;
        output_matrix_reuse[i].eval_mode = eval_mode_j;
      }
      CeedCallBackend(CeedBasisDestroy(&basis_j));
    }
    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
      CeedEvalMode eval_mode_j;
      CeedBasis    basis_j;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
      if (basis_i == basis_j) {
        output_matrix_reuse[i].index     = j;
        output_matrix_reuse[i].is_input  = false;
        output_matrix_reuse[i].eval_mode = eval_mode_j;
      }
      CeedCallBackend(CeedBasisDestroy(&basis_j));
    }
    CeedCallBackend(CeedBasisDestroy(&basis_i));
  }

  // Initialize constants, and matrices B and G
  code << "\n" << tab << "// Input field constants and basis data\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
  }
  code << "\n" << tab << "// Output field constants and basis data\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
  }

  // Loop over all elements
  code << "\n" << tab << "// Element loop\n";
  code << tab << "__syncthreads();\n";
  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
  tab.push();

  // -- Compute minimum buffer space needed
  // Scratch must hold the largest per-thread e-vector slice of any restricted field
  CeedInt max_rstr_buffer_size = 1;

  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedEvalMode eval_mode;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
      CeedInt             num_comp;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    }
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedEvalMode eval_mode;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    if (eval_mode != CEED_EVAL_NONE) {
      CeedInt             num_comp;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    }
  }
  code << tab << "// Scratch restriction buffer space\n";
  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";

  // -- Determine best input field processing order
  // Fields sharing the same (restriction, vector) pair are ordered adjacently so the
  // gathered e-vector in the scratch buffer can be reused instead of re-read
  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];

  for (CeedInt i = 0; i < num_input_fields; i++) {
    field_rstr_in_buffer[i] = -1;
    input_field_order[i]    = -1;
  }
  {
    bool    is_ordered[CEED_FIELD_MAX];
    CeedInt curr_index = 0;

    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedVector          vec_i;
      CeedElemRestriction rstr_i;

      if (is_ordered[i]) continue;
      field_rstr_in_buffer[i] = i;
      is_ordered[i]           = true;
      input_field_order[curr_index] = i;
      curr_index++;
      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
      for (CeedInt j = i + 1; j < num_input_fields; j++) {
        CeedVector          vec_j;
        CeedElemRestriction rstr_j;

        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
        if (rstr_i == rstr_j && vec_i == vec_j) {
          field_rstr_in_buffer[j] = i;
          is_ordered[j]           = true;
          input_field_order[curr_index] = j;
          curr_index++;
        }
        CeedCallBackend(CeedVectorDestroy(&vec_j));
        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
      }
      CeedCallBackend(CeedVectorDestroy(&vec_i));
      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
    }
  }

  // -- Input restriction and basis
  // Passive fields are gathered and evaluated once per element; active fields get a
  // zeroed e-vector register that the assembly loop below perturbs one entry at a time
  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
  CeedInt active_field_index = -1;

  for (CeedInt i = 0; i < num_input_fields; i++) {
    bool          is_active = false;
    const char   *field_name;
    const CeedInt f = input_field_order[i];

    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      CeedCallBackend(CeedVectorDestroy(&vec));
    }

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";

    if (is_active) {
      std::string var_suffix = "_in_" + std::to_string(f);

      code << tab << "// Active field - no restriction or basis action here\n";
      if (active_field_index == -1) {
        // First active field owns the e-vector registers; later active fields alias them
        active_field_index = f;
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
             << "] = {0.0};\n";
      } else {
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
      }
    } else {
      // ---- Restriction
      CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
                                                                 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));

      // ---- Basis action
      CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
                                                           is_all_tensor, is_at_points, use_3d_slices));
    }
  }

  // -- Loop over active field
  // Assembly loop: iterate over every (component, node) of the active field's e-vector
  std::string active_var_suffix = "_in_" + std::to_string(active_field_index);

  code << "\n" << tab << "// Loop over nodes in active field\n";
  code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
       << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
  tab.push();

  // -- Set current active node and component to 1
  code << tab << "// Set current active node and component to 1.0\n";
  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
       << active_var_suffix << ");\n\n";

  // Only active fields need re-evaluation inside the assembly loop; passive field
  // quadrature data emitted above is unchanged between iterations
  for (CeedInt i = 0; i < num_input_fields; i++) {
    bool          is_active = false;
    const char   *field_name;
    const CeedInt f = input_field_order[i];

    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
    if (!is_active) continue;

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";

    // ---- Basis action
    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
                                                         is_all_tensor, is_at_points, use_3d_slices));
  }

  // -- Q function
  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
                                                           qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));

  // -- Output basis and restriction
  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    bool        is_active = false;
    const char *field_name;

    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
    if (!is_active) continue;

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";

    // ---- Basis action
    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
                                                         is_all_tensor, is_at_points, use_3d_slices));

    // ---- Restriction
    // Full assembly writes a whole column of the operator matrix for node n;
    // diagonal assembly writes only the entry matching the perturbed node (via indices)
    if (is_full) {
      std::string         var_suffix = "_out_" + std::to_string(i);
      CeedInt             comp_stride;
      CeedSize            l_size;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
      code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
           << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    } else {
      std::string         var_suffix = "_out_" + std::to_string(i);
      CeedInt             comp_stride;
      CeedSize            l_size;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
      code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
           << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    }
  }

  // -- Reset current active node and component
  // Restore the unit entry to zero so the next iteration perturbs a clean e-vector
  code << "\n" << tab << "// Reset current active node and component to 0.0\n";
  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
       << active_var_suffix << ");\n";

  // -- End of loop over active field
  tab.pop();
  code << tab << "}\n";

  // Close loop and function
  tab.pop();
  code << tab << "}\n";
  tab.pop();
  code << tab << "}\n";
  code << tab << "// -----------------------------------------------------------------------------\n\n";

  CeedInt block_sizes[3] = {0, 0, 0};
  CeedInt num_elem;

  // Compile
  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
  CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
  {
    bool is_compile_good = false;

    data->thread_1d = block_sizes[0];
    // On compile failure the kernel handle is not retrieved; the fallback flag
    // routes subsequent assembly through the reference path instead
    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
                                       is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
                                       "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
    if (is_compile_good) {
      *is_good_build = true;
      CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
                                        is_full ? &data->assemble_full : &data->assemble_diagonal));
    } else {
      *is_good_build = false;
      data->use_assembly_fallback = true;
    }
  }
  CeedCallBackend(CeedDestroy(&ceed));
  CeedCallBackend(CeedQFunctionDestroy(&qf));
  return CEED_ERROR_SUCCESS;
}
2115
CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op,bool * is_good_build)2116 extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2117 return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
2118 }
2119
CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op,bool * is_good_build)2120 extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2121 return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
2122 }
2123 //------------------------------------------------------------------------------
2124 // Build QFunction assembly operator kernel
2125 //------------------------------------------------------------------------------
CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op,bool * is_good_build)2126 extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
2127 bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
2128 Ceed ceed;
2129 CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
2130 CeedQFunctionField *qf_input_fields, *qf_output_fields;
2131 CeedQFunction_Hip_gen *qf_data;
2132 CeedQFunction qf;
2133 CeedOperatorField *op_input_fields, *op_output_fields;
2134 CeedOperator_Hip_gen *data;
2135 std::ostringstream code;
2136 Tab tab;
2137
2138 // Check compatibility
2139 CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
2140 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
2141 CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
2142
2143 // Check field compatibility
2144 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
2145 {
2146 bool has_shared_bases = true;
2147
2148 for (CeedInt i = 0; i < num_input_fields; i++) {
2149 CeedBasis basis;
2150
2151 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
2152 if (basis != CEED_BASIS_NONE) {
2153 bool is_tensor = true;
2154 const char *resource;
2155 char *resource_root;
2156 Ceed basis_ceed;
2157
2158 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2159 is_all_tensor = is_all_tensor && is_tensor;
2160 is_all_nontensor = is_all_nontensor && !is_tensor;
2161 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2162 CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2163 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2164 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2165 CeedCallBackend(CeedFree(&resource_root));
2166 CeedCallBackend(CeedDestroy(&basis_ceed));
2167 }
2168 CeedCallBackend(CeedBasisDestroy(&basis));
2169 }
2170
2171 for (CeedInt i = 0; i < num_output_fields; i++) {
2172 CeedBasis basis;
2173
2174 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
2175 if (basis != CEED_BASIS_NONE) {
2176 bool is_tensor = true;
2177 const char *resource;
2178 char *resource_root;
2179 Ceed basis_ceed;
2180
2181 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2182 is_all_tensor = is_all_tensor && is_tensor;
2183 is_all_nontensor = is_all_nontensor && !is_tensor;
2184
2185 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2186 CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2187 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2188 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2189 CeedCallBackend(CeedFree(&resource_root));
2190 CeedCallBackend(CeedDestroy(&basis_ceed));
2191 }
2192 CeedCallBackend(CeedBasisDestroy(&basis));
2193 }
2194 }
2195
2196 // Retrieve operator data
2197 CeedCallBackend(CeedOperatorGetData(op, &data));
2198 Q = data->Q;
2199 Q_1d = data->Q_1d;
2200 max_dim = data->dim;
2201 CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
2202 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
2203 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
2204
2205 // Load basis source files
2206 if (!is_all_nontensor) {
2207 code << tab << "// Tensor basis source\n";
2208 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
2209 }
2210 if (!is_all_tensor) {
2211 code << tab << "// Non-tensor basis source\n";
2212 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
2213 }
2214 if (!is_all_tensor && !is_all_nontensor) {
2215 code << "// Tensor basis source\n";
2216 code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
2217 }
2218 code << "// CodeGen operator source\n";
2219 code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
2220
2221 // Get QFunction name
2222 std::string qfunction_name(qf_data->qfunction_name);
2223 std::string operator_name;
2224
2225 operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name;
2226
2227 // Define CEED_Q_VLA
2228 code << "\n" << tab << "#undef CEED_Q_VLA\n";
2229 if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
2230 code << tab << "#define CEED_Q_VLA 1\n\n";
2231 } else {
2232 code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
2233 }
2234
2235 // Add user QFunction source
2236 {
2237 const char *source_path;
2238
2239 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
2240 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
2241
2242 code << tab << "// User QFunction source\n";
2243 code << tab << "#include \"" << source_path << "\"\n\n";
2244 }
2245
2246 // Setup
2247 code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
2248 code << tab << "// Operator Assembly Kernel\n";
2249 code << tab << "// \n";
2250 code << tab << "// d_[in,out]_i: CeedVector device array\n";
2251 code << tab << "// r_[in,out]_e_i: Element vector register\n";
2252 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
2253 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
2254 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
2255 code << tab << "// \n";
2256 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
2257 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
2258 code << tab << "// -----------------------------------------------------------------------------\n";
2259 code << tab << "extern \"C\" __global__ void " << operator_name
2260 << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
2261 "points, CeedScalar *__restrict__ values_array) {\n";
2262 tab.push();
2263
2264 // Scratch buffers
2265 for (CeedInt i = 0; i < num_input_fields; i++) {
2266 CeedEvalMode eval_mode;
2267
2268 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2269 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
2270 code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
2271 }
2272 }
2273 for (CeedInt i = 0; i < num_output_fields; i++) {
2274 bool is_active = false;
2275
2276 {
2277 CeedVector vec;
2278
2279 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2280 is_active = vec == CEED_VECTOR_ACTIVE;
2281 CeedCallBackend(CeedVectorDestroy(&vec));
2282 }
2283 if (is_active) {
2284 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
2285 }
2286 }
2287
2288 code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
2289 if (!is_all_tensor) {
2290 code << tab << "const CeedInt Q = " << Q << ";\n";
2291 }
2292 if (!is_all_nontensor) {
2293 code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
2294 }
2295
2296 // Shared data
2297 code << tab << "extern __shared__ CeedScalar slice[];\n";
2298 code << tab << "SharedData_Hip data;\n";
2299 code << tab << "data.t_id_x = threadIdx.x;\n";
2300 code << tab << "data.t_id_y = threadIdx.y;\n";
2301 code << tab << "data.t_id_z = threadIdx.z;\n";
2302 code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
2303 code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
2304
2305 // -- Determine input mat reuse
2306 FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
2307
2308 for (CeedInt i = 0; i < num_input_fields; i++) {
2309 input_matrix_reuse[i].index = -1;
2310 }
2311 for (CeedInt i = 0; i < num_input_fields; i++) {
2312 bool is_tensor = true;
2313 CeedEvalMode eval_mode_i;
2314 CeedBasis basis_i;
2315
2316 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
2317 if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
2318 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
2319 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2320 for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
2321 CeedEvalMode eval_mode_j;
2322 CeedBasis basis_j;
2323
2324 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2325 if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2326 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2327 if (basis_i == basis_j) {
2328 if (is_tensor) {
2329 input_matrix_reuse[i].index = j;
2330 input_matrix_reuse[i].is_input = true;
2331 input_matrix_reuse[i].eval_mode = eval_mode_j;
2332 } else {
2333 // For non-tensor can only re-use with the same eval mode
2334 if (eval_mode_i == eval_mode_j) {
2335 input_matrix_reuse[i].index = j;
2336 input_matrix_reuse[i].is_input = true;
2337 input_matrix_reuse[i].eval_mode = eval_mode_j;
2338 }
2339 }
2340 }
2341 CeedCallBackend(CeedBasisDestroy(&basis_j));
2342 }
2343 CeedCallBackend(CeedBasisDestroy(&basis_i));
2344 }
2345
2346 // -- Determine output mat reuse
2347 FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
2348
2349 for (CeedInt i = 0; i < num_output_fields; i++) {
2350 output_matrix_reuse[i].index = -1;
2351 }
2352 for (CeedInt i = 0; i < num_output_fields; i++) {
2353 bool is_tensor = true;
2354 CeedEvalMode eval_mode_i;
2355 CeedBasis basis_i;
2356
2357 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
2358 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
2359 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2360 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
2361 CeedEvalMode eval_mode_j;
2362 CeedBasis basis_j;
2363
2364 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2365 if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2366 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2367 if (basis_i == basis_j) {
2368 if (is_tensor) {
2369 output_matrix_reuse[i].index = j;
2370 output_matrix_reuse[i].is_input = true;
2371 output_matrix_reuse[i].eval_mode = eval_mode_j;
2372 } else {
2373 // For non-tensor can only re-use with the same eval mode
2374 if (eval_mode_i == eval_mode_j) {
2375 output_matrix_reuse[i].index = j;
2376 output_matrix_reuse[i].is_input = true;
2377 output_matrix_reuse[i].eval_mode = eval_mode_j;
2378 }
2379 }
2380 }
2381 CeedCallBackend(CeedBasisDestroy(&basis_j));
2382 }
2383 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
2384 CeedEvalMode eval_mode_j;
2385 CeedBasis basis_j;
2386
2387 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
2388 if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2389 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
2390 if (basis_i == basis_j) {
2391 if (is_tensor) {
2392 output_matrix_reuse[i].index = j;
2393 output_matrix_reuse[i].is_input = false;
2394 output_matrix_reuse[i].eval_mode = eval_mode_j;
2395 } else {
2396 // For non-tensor can only re-use with the same eval mode
2397 if (eval_mode_i == eval_mode_j) {
2398 output_matrix_reuse[i].index = j;
2399 output_matrix_reuse[i].is_input = false;
2400 output_matrix_reuse[i].eval_mode = eval_mode_j;
2401 }
2402 }
2403 }
2404 CeedCallBackend(CeedBasisDestroy(&basis_j));
2405 }
2406 CeedCallBackend(CeedBasisDestroy(&basis_i));
2407 }
2408
2409 // Initialize constants, and matrices B and G
2410 code << "\n" << tab << "// Input field constants and basis data\n";
2411 for (CeedInt i = 0; i < num_input_fields; i++) {
2412 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
2413 max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true));
2414 }
2415 code << "\n" << tab << "// Output field constants and basis data\n";
2416 for (CeedInt i = 0; i < num_output_fields; i++) {
2417 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
2418 max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true));
2419 }
2420
2421 // Loop over all elements
2422 code << "\n" << tab << "// Element loop\n";
2423 code << tab << "__syncthreads();\n";
2424 code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
2425 tab.push();
2426
2427 // -- Compute minimum buffer space needed
2428 CeedInt max_rstr_buffer_size = 1;
2429
2430 for (CeedInt i = 0; i < num_input_fields; i++) {
2431 CeedEvalMode eval_mode;
2432
2433 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2434 if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
2435 CeedInt num_comp;
2436 CeedElemRestriction elem_rstr;
2437
2438 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
2439 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2440 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2441 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2442 }
2443 }
2444 for (CeedInt i = 0; i < num_output_fields; i++) {
2445 CeedEvalMode eval_mode;
2446
2447 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
2448 if (eval_mode != CEED_EVAL_NONE) {
2449 CeedInt num_comp;
2450 CeedElemRestriction elem_rstr;
2451
2452 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2453 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2454 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2455 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2456 }
2457 }
2458 code << tab << "// Scratch restriction buffer space\n";
2459 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
2460
2461 // -- Determine best input field processing order
2462 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
2463
2464 for (CeedInt i = 0; i < num_input_fields; i++) {
2465 field_rstr_in_buffer[i] = -1;
2466 input_field_order[i] = -1;
2467 }
2468 {
2469 bool is_ordered[CEED_FIELD_MAX];
2470 CeedInt curr_index = 0;
2471
2472 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
2473 for (CeedInt i = 0; i < num_input_fields; i++) {
2474 CeedVector vec_i;
2475 CeedElemRestriction rstr_i;
2476
2477 if (is_ordered[i]) continue;
2478 field_rstr_in_buffer[i] = i;
2479 is_ordered[i] = true;
2480 input_field_order[curr_index] = i;
2481 curr_index++;
2482 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
2483 if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT
2484 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
2485 for (CeedInt j = i + 1; j < num_input_fields; j++) {
2486 CeedVector vec_j;
2487 CeedElemRestriction rstr_j;
2488
2489 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
2490 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
2491 if (rstr_i == rstr_j && vec_i == vec_j) {
2492 field_rstr_in_buffer[j] = i;
2493 is_ordered[j] = true;
2494 input_field_order[curr_index] = j;
2495 curr_index++;
2496 }
2497 CeedCallBackend(CeedVectorDestroy(&vec_j));
2498 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
2499 }
2500 CeedCallBackend(CeedVectorDestroy(&vec_i));
2501 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
2502 }
2503 }
2504
2505 // -- Input restriction and basis
2506 code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
2507 CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
2508 CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
2509
2510 for (CeedInt i = 0; i < num_input_fields; i++) {
2511 bool is_active = false;
2512 const char *field_name;
2513 const CeedInt f = input_field_order[i];
2514
2515 {
2516 CeedVector vec;
2517
2518 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
2519 is_active = vec == CEED_VECTOR_ACTIVE;
2520 CeedCallBackend(CeedVectorDestroy(&vec));
2521 }
2522
2523 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
2524 code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
2525
2526 if (is_active) {
2527 CeedEvalMode eval_mode;
2528 CeedInt field_size;
2529
2530 active_fields_in[num_active_in] = f;
2531 num_active_in++;
2532 CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
2533 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
2534 if (eval_mode == CEED_EVAL_GRAD) {
2535 code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
2536 << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2537 } else {
2538 code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2539 }
2540 code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
2541 } else {
2542 // ---- Restriction
2543 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
2544 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2545
2546 // ---- Basis action
2547 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
2548 is_all_tensor, is_at_points, use_3d_slices));
2549 }
2550 }
2551 code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
2552 for (CeedInt i = 0; i < num_active_in; i++) {
2553 code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2554 }
2555 code << "};\n";
2556 code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
2557 for (CeedInt i = 0; i < num_active_in; i++) {
2558 code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2559 }
2560 code << "};\n";
2561
2562 for (CeedInt i = 0; i < num_output_fields; i++) {
2563 bool is_active = false;
2564
2565 {
2566 CeedVector vec;
2567
2568 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2569 is_active = vec == CEED_VECTOR_ACTIVE;
2570 CeedCallBackend(CeedVectorDestroy(&vec));
2571 }
2572 if (is_active) {
2573 const char *field_name;
2574 CeedInt field_size;
2575
2576 active_fields_out[num_active_out] = i;
2577 num_active_out++;
2578 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2579 qf_assembly_size_out += field_size;
2580 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2581 code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2582 code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
2583 }
2584 }
2585 code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
2586 for (CeedInt i = 0; i < num_active_out; i++) {
2587 code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
2588 }
2589 code << "};\n";
2590 code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
2591
2592 // -- Loop over active field
2593 code << "\n" << tab << "CeedInt input_offset = 0;\n";
2594 code << tab << "// Loop over active QFunction input fields\n";
2595 code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
2596 code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
2597 tab.push();
2598
2599 // -- Loop over size of active field
2600 code << "\n" << tab << "// Loop over current active input field size\n";
2601 code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
2602 code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
2603 tab.push();
2604
2605 // -- Set current active point and component to 1
2606 code << tab << "// Set current active point and component to 1.0\n";
2607 if (is_all_tensor && (max_dim >= 3)) {
2608 code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
2609 } else {
2610 code << tab << "r_q_in[a][s] = 1.0;\n";
2611 }
2612
2613 // -- Q function
2614 CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
2615 qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2616 Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
2617
2618 // -- Output basis and restriction
2619 code << "\n" << tab << "// -- Output field basis action and restrictions\n";
2620 CeedScalar offset = 0;
2621
2622 for (CeedInt i = 0; i < num_output_fields; i++) {
2623 bool is_active = false;
2624 const char *field_name;
2625
2626 {
2627 CeedVector vec;
2628
2629 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2630 is_active = vec == CEED_VECTOR_ACTIVE;
2631 CeedCallBackend(CeedVectorDestroy(&vec));
2632 }
2633 if (!is_active) continue;
2634
2635 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2636 code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2637
2638 // ---- Restriction
2639 CeedInt field_size;
2640
2641 code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
2642 << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
2643 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2644 offset += field_size;
2645 }
2646
2647 // -- Reset current active node and component
2648 code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2649 if (is_all_tensor && (max_dim >= 3)) {
2650 code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
2651 } else {
2652 code << tab << "r_q_in[a][s] = 0.0;\n";
2653 }
2654
2655 // -- End of loop over size of active field
2656 tab.pop();
2657 code << tab << "}\n";
2658 code << tab << "input_offset += field_size_in;\n";
2659
2660 // -- End of loop over active field
2661 tab.pop();
2662 code << tab << "}\n";
2663
2664 // Close loop and function
2665 tab.pop();
2666 code << tab << "}\n";
2667 tab.pop();
2668 code << tab << "}\n";
2669 code << tab << "// -----------------------------------------------------------------------------\n\n";
2670
2671 CeedInt block_sizes[3] = {0, 0, 0};
2672 CeedInt num_elem;
2673
2674 // Compile
2675 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2676 CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2677 {
2678 bool is_compile_good = false;
2679
2680 data->thread_1d = block_sizes[0];
2681 CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0],
2682 "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2683 if (is_compile_good) {
2684 *is_good_build = true;
2685 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
2686 } else {
2687 *is_good_build = false;
2688 data->use_assembly_fallback = true;
2689 }
2690 }
2691 CeedCallBackend(CeedDestroy(&ceed));
2692 CeedCallBackend(CeedQFunctionDestroy(&qf));
2693 return CEED_ERROR_SUCCESS;
2694 }
2695
2696 //------------------------------------------------------------------------------
2697