1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <string.h>
14 #include <hip/hip_runtime.h>
15
16 #include "../hip/ceed-hip-common.h"
17 #include "../hip/ceed-hip-compile.h"
18 #include "ceed-hip-shared.h"
19
20 //------------------------------------------------------------------------------
21 // Compute a block size based on required minimum threads
22 //------------------------------------------------------------------------------
ComputeBlockSizeFromRequirement(const CeedInt required)23 static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) {
24 CeedInt maxSize = 1024; // Max total threads per block
25 CeedInt currentSize = 64; // Start with one group
26
27 while (currentSize < maxSize) {
28 if (currentSize > required) break;
29 else currentSize = currentSize * 2;
30 }
31 return currentSize;
32 }
33
34 //------------------------------------------------------------------------------
35 // Compute required thread block sizes for basis kernels given P, Q, dim, and
36 // num_comp (num_comp not currently used, but may be again in other basis
37 // parallelization options)
38 //------------------------------------------------------------------------------
ComputeBasisThreadBlockSizes(const CeedInt dim,const CeedInt P_1d,const CeedInt Q_1d,const CeedInt num_comp,CeedInt * block_sizes)39 static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, const CeedInt num_comp, CeedInt *block_sizes) {
40 // Note that this will use the same block sizes for all dimensions when compiling,
41 // but as each basis object is defined for a particular dimension, we will never
42 // call any kernels except the ones for the dimension for which we have computed the
43 // block sizes.
44 const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d);
45
46 switch (dim) {
47 case 1: {
48 // Interp kernels:
49 block_sizes[0] = 256;
50
51 // Grad kernels:
52 block_sizes[1] = 256;
53
54 // Weight kernels:
55 block_sizes[2] = 256;
56 } break;
57 case 2: {
58 // Interp kernels:
59 CeedInt required = thread_1d * thread_1d;
60
61 block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
62
63 // Grad kernels: currently use same required minimum threads
64 block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
65
66 // Weight kernels:
67 required = CeedIntMax(64, Q_1d * Q_1d);
68 block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
69
70 } break;
71 case 3: {
72 // Interp kernels:
73 CeedInt required = thread_1d * thread_1d;
74
75 block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
76
77 // Grad kernels: currently use same required minimum threads
78 block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
79
80 // Weight kernels:
81 required = Q_1d * Q_1d * Q_1d;
82 block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
83 }
84 }
85 return CEED_ERROR_SUCCESS;
86 }
87
88 //------------------------------------------------------------------------------
89 // Apply basis
90 //------------------------------------------------------------------------------
CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis,bool apply_add,const CeedInt num_elem,CeedTransposeMode t_mode,CeedEvalMode eval_mode,CeedVector u,CeedVector v)91 static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
92 CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
93 Ceed ceed;
94 Ceed_Hip *ceed_Hip;
95 CeedInt dim, num_comp;
96 const CeedScalar *d_u;
97 CeedScalar *d_v;
98 CeedBasis_Hip_shared *data;
99
100 CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
101 CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
102 CeedCallBackend(CeedBasisGetData(basis, &data));
103 CeedCallBackend(CeedBasisGetDimension(basis, &dim));
104 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
105
106 // Get read/write access to u, v
107 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
108 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
109 if (apply_add) {
110 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
111 } else {
112 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
113 }
114
115 // Apply basis operation
116 switch (eval_mode) {
117 case CEED_EVAL_INTERP: {
118 CeedInt P_1d, Q_1d;
119 CeedInt block_size = data->block_sizes[0];
120
121 CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]);
122 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
123 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
124 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
125 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
126
127 if (dim == 1) {
128 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
129 elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
130 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
131 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar);
132
133 if (t_mode == CEED_TRANSPOSE) {
134 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1,
135 elems_per_block, shared_mem, interp_args));
136 } else {
137 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
138 }
139 } else if (dim == 2) {
140 // Check if required threads is small enough to do multiple elems
141 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
142 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
143 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
144
145 if (t_mode == CEED_TRANSPOSE) {
146 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
147 thread_1d, elems_per_block, shared_mem, interp_args));
148 } else {
149 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
150 }
151 } else if (dim == 3) {
152 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
153 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
154 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
155
156 if (t_mode == CEED_TRANSPOSE) {
157 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
158 thread_1d, elems_per_block, shared_mem, interp_args));
159 } else {
160 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
161 }
162 }
163 } break;
164 case CEED_EVAL_GRAD: {
165 CeedInt P_1d, Q_1d;
166 CeedInt block_size = data->block_sizes[1];
167
168 CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]);
169 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
170 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
171 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
172 CeedScalar *d_grad_1d = data->d_grad_1d;
173
174 if (data->d_collo_grad_1d) {
175 d_grad_1d = data->d_collo_grad_1d;
176 }
177 void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};
178
179 if (dim == 1) {
180 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
181 elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
182 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
183 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar);
184
185 if (t_mode == CEED_TRANSPOSE) {
186 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 1,
187 elems_per_block, shared_mem, grad_args));
188 } else {
189 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
190 }
191 } else if (dim == 2) {
192 // Check if required threads is small enough to do multiple elems
193 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
194 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
195 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
196
197 if (t_mode == CEED_TRANSPOSE) {
198 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, thread_1d,
199 elems_per_block, shared_mem, grad_args));
200 } else {
201 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
202 }
203 } else if (dim == 3) {
204 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
205 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
206 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
207
208 if (t_mode == CEED_TRANSPOSE) {
209 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, thread_1d,
210 elems_per_block, shared_mem, grad_args));
211 } else {
212 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
213 }
214 }
215 } break;
216 case CEED_EVAL_WEIGHT: {
217 CeedInt Q_1d;
218 CeedInt block_size = data->block_sizes[2];
219
220 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
221 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
222 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
223
224 if (dim == 1) {
225 const CeedInt opt_elems = block_size / Q_1d;
226 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
227 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
228
229 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args));
230 } else if (dim == 2) {
231 const CeedInt opt_elems = block_size / (Q_1d * Q_1d);
232 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
233 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
234
235 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
236 } else if (dim == 3) {
237 const CeedInt opt_elems = block_size / (Q_1d * Q_1d);
238 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
239 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
240
241 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
242 }
243 } break;
244 case CEED_EVAL_NONE: /* handled separately below */
245 break;
246 // LCOV_EXCL_START
247 case CEED_EVAL_DIV:
248 case CEED_EVAL_CURL:
249 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
250 // LCOV_EXCL_STOP
251 }
252
253 // Restore vectors, cover CEED_EVAL_NONE
254 CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
255 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
256 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
257 CeedCallBackend(CeedDestroy(&ceed));
258 return CEED_ERROR_SUCCESS;
259 }
260
CeedBasisApplyTensor_Hip_shared(CeedBasis basis,const CeedInt num_elem,CeedTransposeMode t_mode,CeedEvalMode eval_mode,CeedVector u,CeedVector v)261 int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
262 CeedVector v) {
263 CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
264 return CEED_ERROR_SUCCESS;
265 }
266
CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis,const CeedInt num_elem,CeedTransposeMode t_mode,CeedEvalMode eval_mode,CeedVector u,CeedVector v)267 int CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
268 CeedVector v) {
269 CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
270 return CEED_ERROR_SUCCESS;
271 }
272
273 //------------------------------------------------------------------------------
274 // Basis apply - tensor AtPoints
275 //------------------------------------------------------------------------------
CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis,bool apply_add,const CeedInt num_elem,const CeedInt * num_points,CeedTransposeMode t_mode,CeedEvalMode eval_mode,CeedVector x_ref,CeedVector u,CeedVector v)276 static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
277 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
278 Ceed ceed;
279 CeedInt Q_1d, dim, max_num_points = num_points[0];
280 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE;
281 const CeedScalar *d_x, *d_u;
282 CeedScalar *d_v;
283 CeedBasis_Hip_shared *data;
284
285 CeedCallBackend(CeedBasisGetData(basis, &data));
286 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
287 CeedCallBackend(CeedBasisGetDimension(basis, &dim));
288
289 // Weight handled separately
290 if (eval_mode == CEED_EVAL_WEIGHT) {
291 CeedCallBackend(CeedVectorSetValue(v, 1.0));
292 return CEED_ERROR_SUCCESS;
293 }
294
295 CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
296
297 // Check padded to uniform number of points per elem
298 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
299 {
300 CeedInt num_comp, q_comp;
301 CeedSize len, len_required;
302
303 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
304 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
305 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
306 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
307 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
308 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
309 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
310 len, len_required);
311 }
312
313 // Move num_points array to device
314 if (is_transpose) {
315 const CeedInt num_bytes = num_elem * sizeof(CeedInt);
316
317 if (num_elem != data->num_elem_at_points) {
318 data->num_elem_at_points = num_elem;
319
320 if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
321 CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
322 CeedCallBackend(CeedFree(&data->h_points_per_elem));
323 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
324 }
325 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
326 memcpy(data->h_points_per_elem, num_points, num_bytes);
327 CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
328 }
329 }
330
331 // Build kernels if needed
332 if (data->num_points != max_num_points) {
333 CeedInt P_1d;
334
335 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
336 data->num_points = max_num_points;
337
338 // -- Create interp matrix to Chebyshev coefficients
339 if (!data->d_chebyshev_interp_1d) {
340 CeedSize interp_bytes;
341 CeedScalar *chebyshev_interp_1d;
342
343 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
344 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
345 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
346 CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
347 CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
348 CeedCallBackend(CeedFree(&chebyshev_interp_1d));
349 }
350
351 // -- Compile kernels
352 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h>\n";
353 CeedInt num_comp;
354
355 if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
356 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
357 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
358 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
359 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "BASIS_INTERP_BLOCK_SIZE",
360 data->block_sizes[0]));
361 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
362 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
363 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAddAtPoints", &data->InterpTransposeAddAtPoints));
364 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
365 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
366 CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAddAtPoints", &data->GradTransposeAddAtPoints));
367 }
368
369 // Get read/write access to u, v
370 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
371 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
372 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
373 if (apply_add) {
374 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
375 } else {
376 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
377 }
378
379 // Basis action
380 switch (eval_mode) {
381 case CEED_EVAL_INTERP: {
382 CeedInt P_1d, Q_1d;
383 CeedInt block_size = data->block_sizes[0];
384
385 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
386 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
387 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
388 void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
389
390 if (dim == 1) {
391 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
392 elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
393 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
394 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar);
395
396 if (is_transpose) {
397 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
398 thread_1d, 1, elems_per_block, shared_mem, interp_args));
399 } else {
400 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
401 }
402 } else if (dim == 2) {
403 // Check if required threads is small enough to do multiple elems
404 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
405 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
406 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
407
408 if (is_transpose) {
409 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
410 thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
411 } else {
412 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
413 interp_args));
414 }
415 } else if (dim == 3) {
416 const CeedInt elems_per_block = 1;
417 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
418 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
419
420 if (is_transpose) {
421 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
422 thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
423 } else {
424 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
425 interp_args));
426 }
427 }
428 } break;
429 case CEED_EVAL_GRAD: {
430 CeedInt P_1d, Q_1d;
431 CeedInt block_size = data->block_sizes[0];
432
433 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
434 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
435 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
436 void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
437
438 if (dim == 1) {
439 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
440 elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
441 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
442 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar);
443
444 if (is_transpose) {
445 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
446 thread_1d, 1, elems_per_block, shared_mem, grad_args));
447 } else {
448 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
449 }
450 } else if (dim == 2) {
451 // Check if required threads is small enough to do multiple elems
452 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
453 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
454 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
455
456 if (is_transpose) {
457 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
458 thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
459 } else {
460 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
461 grad_args));
462 }
463 } else if (dim == 3) {
464 const CeedInt elems_per_block = 1;
465 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
466 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
467
468 if (is_transpose) {
469 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
470 thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
471 } else {
472 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
473 grad_args));
474 }
475 }
476 } break;
477 case CEED_EVAL_WEIGHT:
478 case CEED_EVAL_NONE: /* handled separately below */
479 break;
480 // LCOV_EXCL_START
481 case CEED_EVAL_DIV:
482 case CEED_EVAL_CURL:
483 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
484 // LCOV_EXCL_STOP
485 }
486
487 // Restore vectors, cover CEED_EVAL_NONE
488 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
489 CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
490 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
491 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
492 CeedCallBackend(CeedDestroy(&ceed));
493 return CEED_ERROR_SUCCESS;
494 }
495
CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis,const CeedInt num_elem,const CeedInt * num_points,CeedTransposeMode t_mode,CeedEvalMode eval_mode,CeedVector x_ref,CeedVector u,CeedVector v)496 static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
497 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
498 CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
499 return CEED_ERROR_SUCCESS;
500 }
501
CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis,const CeedInt num_elem,const CeedInt * num_points,CeedTransposeMode t_mode,CeedEvalMode eval_mode,CeedVector x_ref,CeedVector u,CeedVector v)502 static int CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
503 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
504 CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
505 return CEED_ERROR_SUCCESS;
506 }
507
508 //------------------------------------------------------------------------------
509 // Apply basis
510 //------------------------------------------------------------------------------
CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis,bool apply_add,const CeedInt num_elem,CeedTransposeMode t_mode,CeedEvalMode eval_mode,CeedVector u,CeedVector v)511 static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
512 CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
513 Ceed ceed;
514 Ceed_Hip *ceed_Hip;
515 CeedInt dim, num_comp;
516 const CeedScalar *d_u;
517 CeedScalar *d_v;
518 CeedBasis_Hip_shared *data;
519
520 CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
521 CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
522 CeedCallBackend(CeedBasisGetData(basis, &data));
523 CeedCallBackend(CeedBasisGetDimension(basis, &dim));
524 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
525
526 // Get read/write access to u, v
527 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
528 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
529 if (apply_add) {
530 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
531 } else {
532 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
533 }
534
535 // Apply basis operation
536 switch (eval_mode) {
537 case CEED_EVAL_INTERP: {
538 CeedInt P, Q;
539
540 CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]);
541 CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
542 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
543 CeedInt thread = CeedIntMax(Q, P);
544 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
545
546 {
547 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
548 elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
549 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
550 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar);
551
552 if (t_mode == CEED_TRANSPOSE) {
553 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1,
554 elems_per_block, shared_mem, interp_args));
555 } else {
556 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, interp_args));
557 }
558 }
559 } break;
560 case CEED_EVAL_GRAD: {
561 CeedInt P, Q;
562
563 CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]);
564 CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
565 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
566 CeedInt thread = CeedIntMax(Q, P);
567 void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v};
568
569 {
570 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
571 elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
572 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
573 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar);
574
575 if (t_mode == CEED_TRANSPOSE) {
576 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1,
577 elems_per_block, shared_mem, grad_args));
578 } else {
579 CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, grad_args));
580 }
581 }
582 } break;
583 case CEED_EVAL_WEIGHT: {
584 CeedInt P, Q;
585
586 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
587 CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
588 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
589 CeedInt thread = CeedIntMax(Q, P);
590 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
591
592 {
593 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
594 elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
595 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
596
597 CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, thread, elems_per_block, 1, weight_args));
598 }
599 } break;
600 case CEED_EVAL_NONE: /* handled separately below */
601 break;
602 // LCOV_EXCL_START
603 case CEED_EVAL_DIV:
604 case CEED_EVAL_CURL:
605 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
606 // LCOV_EXCL_STOP
607 }
608
609 // Restore vectors, cover CEED_EVAL_NONE
610 CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
611 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
612 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
613 CeedCallBackend(CeedDestroy(&ceed));
614 return CEED_ERROR_SUCCESS;
615 }
616
CeedBasisApplyNonTensor_Hip_shared(CeedBasis basis,const CeedInt num_elem,CeedTransposeMode t_mode,CeedEvalMode eval_mode,CeedVector u,CeedVector v)617 int CeedBasisApplyNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
618 CeedVector v) {
619 CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
620 return CEED_ERROR_SUCCESS;
621 }
622
CeedBasisApplyAddNonTensor_Hip_shared(CeedBasis basis,const CeedInt num_elem,CeedTransposeMode t_mode,CeedEvalMode eval_mode,CeedVector u,CeedVector v)623 int CeedBasisApplyAddNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
624 CeedVector v) {
625 CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
626 return CEED_ERROR_SUCCESS;
627 }
628
629 //------------------------------------------------------------------------------
630 // Destroy basis
631 //------------------------------------------------------------------------------
CeedBasisDestroy_Hip_shared(CeedBasis basis)632 static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
633 Ceed ceed;
634 CeedBasis_Hip_shared *data;
635
636 CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
637 CeedCallBackend(CeedBasisGetData(basis, &data));
638 CeedCallHip(ceed, hipModuleUnload(data->module));
639 if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
640 if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
641 CeedCallBackend(CeedFree(&data->h_points_per_elem));
642 if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
643 CeedCallHip(ceed, hipFree(data->d_interp_1d));
644 CeedCallHip(ceed, hipFree(data->d_grad_1d));
645 CeedCallHip(ceed, hipFree(data->d_collo_grad_1d));
646 CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
647 CeedCallBackend(CeedFree(&data));
648 CeedCallBackend(CeedDestroy(&ceed));
649 return CEED_ERROR_SUCCESS;
650 }
651
652 //------------------------------------------------------------------------------
653 // Create tensor basis
654 //------------------------------------------------------------------------------
CeedBasisCreateTensorH1_Hip_shared(CeedInt dim,CeedInt P_1d,CeedInt Q_1d,const CeedScalar * interp_1d,const CeedScalar * grad_1d,const CeedScalar * q_ref_1d,const CeedScalar * q_weight_1d,CeedBasis basis)655 int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
656 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
657 Ceed ceed;
658 CeedInt num_comp;
659 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
660 const CeedInt interp_bytes = q_bytes * P_1d;
661 CeedBasis_Hip_shared *data;
662
663 CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
664 CeedCallBackend(CeedCalloc(1, &data));
665
666 // Copy basis data to GPU
667 if (q_weight_1d) {
668 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
669 CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
670 }
671 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
672 CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice));
673 CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes));
674 CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice));
675
676 // Compute collocated gradient and copy to GPU
677 data->d_collo_grad_1d = NULL;
678 bool has_collocated_grad = dim == 3 && Q_1d >= P_1d;
679
680 if (has_collocated_grad) {
681 CeedScalar *collo_grad_1d;
682
683 CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d));
684 CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d));
685 CeedCallHip(ceed, hipMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d));
686 CeedCallHip(ceed, hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, hipMemcpyHostToDevice));
687 CeedCallBackend(CeedFree(&collo_grad_1d));
688 }
689
690 // Set number of threads per block for basis kernels
691 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
692 CeedCallBackend(ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, data->block_sizes));
693
694 // Compile basis kernels
695 bool is_collocated = false;
696 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor.h>\n";
697
698 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
699 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
700 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], "BASIS_GRAD_BLOCK_SIZE",
701 data->block_sizes[1], "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2], "BASIS_HAS_COLLOCATED_GRAD",
702 has_collocated_grad));
703 CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
704 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocated" : "Interp", &data->Interp));
705 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocatedTranspose" : "InterpTranspose", &data->InterpTranspose));
706 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd",
707 &data->InterpTransposeAdd));
708 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocated" : "Grad", &data->Grad));
709 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocatedTranspose" : "GradTranspose", &data->GradTranspose));
710 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocatedTransposeAdd" : "GradTransposeAdd", &data->GradTransposeAdd));
711 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
712
713 CeedCallBackend(CeedBasisSetData(basis, data));
714
715 // Register backend functions
716 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared));
717 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Hip_shared));
718 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared));
719 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip_shared));
720 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
721 CeedCallBackend(CeedDestroy(&ceed));
722 return CEED_ERROR_SUCCESS;
723 }
724
725 //------------------------------------------------------------------------------
726 // Create non-tensor basis
727 //------------------------------------------------------------------------------
CeedBasisCreateH1_Hip_shared(CeedElemTopology topo,CeedInt dim,CeedInt num_nodes,CeedInt num_qpts,const CeedScalar * interp,const CeedScalar * grad,const CeedScalar * q_ref,const CeedScalar * q_weight,CeedBasis basis)728 int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
729 const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
730 Ceed ceed;
731 CeedInt num_comp, q_comp_interp, q_comp_grad;
732 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar);
733 CeedBasis_Hip_shared *data;
734
735 CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
736
737 // Check shared memory size
738 {
739 Ceed_Hip *hip_data;
740
741 CeedCallBackend(CeedGetData(ceed, &hip_data));
742 if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) >
743 hip_data->device_prop.sharedMemPerBlock) {
744 CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis));
745 CeedCallBackend(CeedDestroy(&ceed));
746 return CEED_ERROR_SUCCESS;
747 }
748 }
749
750 CeedCallBackend(CeedCalloc(1, &data));
751
752 // Copy basis data to GPU
753 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
754 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
755 if (q_weight) {
756 CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
757 CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight, q_bytes, hipMemcpyHostToDevice));
758 }
759 if (interp) {
760 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
761
762 CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
763 CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp, interp_bytes, hipMemcpyHostToDevice));
764 }
765 if (grad) {
766 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;
767
768 CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, grad_bytes));
769 CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad, grad_bytes, hipMemcpyHostToDevice));
770 }
771
772 // Compile basis kernels
773 const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-nontensor.h>\n";
774
775 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
776 CeedCallBackend(ComputeBasisThreadBlockSizes(dim, num_nodes, num_qpts, num_comp, data->block_sizes));
777 CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 6, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_T_1D",
778 CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_INTERP_BLOCK_SIZE",
779 data->block_sizes[0]));
780 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
781 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
782 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
783 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
784 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose));
785 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
786 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
787
788 CeedCallBackend(CeedBasisSetData(basis, data));
789
790 // Register backend functions
791 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip_shared));
792 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip_shared));
793 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
794 CeedCallBackend(CeedDestroy(&ceed));
795 return CEED_ERROR_SUCCESS;
796 }
797
798 //------------------------------------------------------------------------------
799