xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator.c (revision d89549c6e0afe634fc6cadb472a2a3224f4b1b63)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-source/cuda/cuda-types.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 #include <stddef.h>
14 #include <string.h>
15 
16 #include "../cuda/ceed-cuda-common.h"
17 #include "../cuda/ceed-cuda-compile.h"
18 #include "ceed-cuda-gen-operator-build.h"
19 #include "ceed-cuda-gen.h"
20 
21 //------------------------------------------------------------------------------
22 // Destroy operator
23 //------------------------------------------------------------------------------
24 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
25   Ceed                   ceed;
26   CeedOperator_Cuda_gen *impl;
27 
28   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
29   CeedCallBackend(CeedOperatorGetData(op, &impl));
30   if (impl->module) CeedCallCuda(ceed, cuModuleUnload(impl->module));
31   if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem));
32   CeedCallBackend(CeedFree(&impl));
33   CeedCallBackend(CeedDestroy(&ceed));
34   return CEED_ERROR_SUCCESS;
35 }
36 
37 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, int elems_per_block) {
38   int useful_threads_per_block = threads_per_elem * elems_per_block;
39   // round up to nearest multiple of warp_size
40   int block_size    = CeedDivUpInt(useful_threads_per_block, warp_size) * warp_size;
41   int blocks_per_sm = threads_per_sm / block_size;
42   return threads_per_sm - useful_threads_per_block * blocks_per_sm;
43 }
44 
45 // Choose the least wasteful block size constrained by blocks_per_sm of max_threads_per_block.
46 //
47 // The x and y part of block[] contains per-element sizes (specified on input) while the z part is number of elements.
48 //
49 // Problem setting: we'd like to make occupancy high with relatively few inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how
50 // many threads can run.
51 //
52 // Note that full occupancy sometimes can't be achieved by one thread block.
53 // For example, an SM might support 1536 threads in total, but only 1024 within a single thread block.
54 // So cuOccupancyMaxPotentialBlockSize may suggest a block size of 768 so that two blocks can run, versus one block of 1024 will prevent a second
55 // block from running. The cuda-gen kernels are pretty heavy with lots of instruction-level parallelism (ILP) so we'll generally be okay with
56 // relatively low occupancy and smaller thread blocks, but we solve a reasonably general problem here. Empirically, we find that blocks bigger than
57 // about 256 have higher latency and worse load balancing when the number of elements is modest.
58 //
59 // cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of the number of quadrature points (or number of basis functions).
60 // They also have a lot of __syncthreads(), which is another point against excessively large thread blocks.
61 // Suppose I have elements with 7x7x7 quadrature points.
62 // This will loop over the last dimension, so we have 7*7=49 threads per element.
63 // Suppose we have two elements = 2*49=98 useful threads.
64 // CUDA schedules in units of full warps (32 threads), so 128 CUDA hardware threads are effectively committed to that block.
65 // Now suppose cuOccupancyMaxPotentialBlockSize returned 352.
66 // We can schedule 2 blocks of size 98 (196 useful threads using 256 hardware threads), but not a third block (which would need a total of 384
67 // hardware threads).
68 //
69 // If instead, we had packed 3 elements, we'd have 3*49=147 useful threads occupying 160 slots, and could schedule two blocks.
70 // Alternatively, we could pack a single block of 7 elements (2*49=343 useful threads) into the 354 slots.
71 // The latter has the least "waste", but __syncthreads() over-synchronizes and it might not pay off relative to smaller blocks.
72 static int BlockGridCalculate(CeedInt num_elem, int blocks_per_sm, int max_threads_per_block, int max_threads_z, int warp_size, int block[3],
73                               int *grid) {
74   const int threads_per_sm   = blocks_per_sm * max_threads_per_block;
75   const int threads_per_elem = block[0] * block[1];
76   int       elems_per_block  = 1;
77   int       waste            = Waste(threads_per_sm, warp_size, threads_per_elem, 1);
78 
79   for (int i = 2; i <= CeedIntMin(max_threads_per_block / threads_per_elem, num_elem); i++) {
80     int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i);
81 
82     // We want to minimize waste, but smaller kernels have lower latency and less __syncthreads() overhead so when a larger block size has the same
83     // waste as a smaller one, go ahead and prefer the smaller block.
84     if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) {
85       elems_per_block = i;
86       waste           = i_waste;
87     }
88   }
89   // In low-order elements, threads_per_elem may be sufficiently low to give an elems_per_block greater than allowable for the device, so we must
90   // check before setting the z-dimension size of the block.
91   block[2] = CeedIntMin(elems_per_block, max_threads_z);
92   *grid    = CeedDivUpInt(num_elem, elems_per_block);
93   return CEED_ERROR_SUCCESS;
94 }
95 
96 // callback for cuOccupancyMaxPotentialBlockSize, providing the amount of dynamic shared memory required for a thread block of size threads.
97 static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); }
98 
99 //------------------------------------------------------------------------------
100 // Apply and add to output
101 //------------------------------------------------------------------------------
102 static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good,
103                                              CeedRequest *request) {
104   bool                    is_at_points, is_tensor;
105   Ceed                    ceed;
106   Ceed_Cuda              *cuda_data;
107   CeedInt                 num_elem, num_input_fields, num_output_fields;
108   CeedEvalMode            eval_mode;
109   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
110   CeedQFunction_Cuda_gen *qf_data;
111   CeedQFunction           qf;
112   CeedOperatorField      *op_input_fields, *op_output_fields;
113   CeedOperator_Cuda_gen  *data;
114 
115   // Build the operator kernel
116   CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, is_run_good));
117   if (!(*is_run_good)) return CEED_ERROR_SUCCESS;
118 
119   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
120   CeedCallBackend(CeedGetData(ceed, &cuda_data));
121   CeedCallBackend(CeedOperatorGetData(op, &data));
122   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
123   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
124   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
125   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
126   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
127 
128   // Input vectors
129   for (CeedInt i = 0; i < num_input_fields; i++) {
130     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
131     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
132       data->fields.inputs[i] = NULL;
133     } else {
134       bool       is_active;
135       CeedVector vec;
136 
137       // Get input vector
138       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
139       is_active = vec == CEED_VECTOR_ACTIVE;
140       if (is_active) data->fields.inputs[i] = input_arr;
141       else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
142       CeedCallBackend(CeedVectorDestroy(&vec));
143     }
144   }
145 
146   // Output vectors
147   for (CeedInt i = 0; i < num_output_fields; i++) {
148     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
149     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
150       data->fields.outputs[i] = NULL;
151     } else {
152       bool       is_active;
153       CeedVector vec;
154 
155       // Get output vector
156       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
157       is_active = vec == CEED_VECTOR_ACTIVE;
158       if (is_active) data->fields.outputs[i] = output_arr;
159       else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
160       CeedCallBackend(CeedVectorDestroy(&vec));
161     }
162   }
163 
164   // Point coordinates, if needed
165   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
166   if (is_at_points) {
167     // Coords
168     CeedVector vec;
169 
170     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
171     CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
172     CeedCallBackend(CeedVectorDestroy(&vec));
173 
174     // Points per elem
175     if (num_elem != data->points.num_elem) {
176       CeedInt            *points_per_elem;
177       const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
178       CeedElemRestriction rstr_points = NULL;
179 
180       data->points.num_elem = num_elem;
181       CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
182       CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
183       for (CeedInt e = 0; e < num_elem; e++) {
184         CeedInt num_points_elem;
185 
186         CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
187         points_per_elem[e] = num_points_elem;
188       }
189       if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
190       CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
191       CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
192       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
193       CeedCallBackend(CeedFree(&points_per_elem));
194     }
195   }
196 
197   // Get context data
198   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
199 
200   // Apply operator
201   void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
202   int   max_threads_per_block, min_grid_size, grid;
203 
204   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
205   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
206   int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
207 
208   if (is_tensor) {
209     CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, is_at_points ? 1 : max_threads_per_block,
210                                        cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
211   } else {
212     CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1));
213 
214     grid     = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
215     block[2] = elems_per_block;
216   }
217   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
218 
219   CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, stream, grid, block[0], block[1], block[2], shared_mem, is_run_good, opargs));
220 
221   // Restore input arrays
222   for (CeedInt i = 0; i < num_input_fields; i++) {
223     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
224     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
225     } else {
226       bool       is_active;
227       CeedVector vec;
228 
229       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
230       is_active = vec == CEED_VECTOR_ACTIVE;
231       if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
232       CeedCallBackend(CeedVectorDestroy(&vec));
233     }
234   }
235 
236   // Restore output arrays
237   for (CeedInt i = 0; i < num_output_fields; i++) {
238     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
239     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
240     } else {
241       bool       is_active;
242       CeedVector vec;
243 
244       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
245       is_active = vec == CEED_VECTOR_ACTIVE;
246       if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
247       CeedCallBackend(CeedVectorDestroy(&vec));
248     }
249   }
250 
251   // Restore point coordinates, if needed
252   if (is_at_points) {
253     CeedVector vec;
254 
255     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
256     CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
257     CeedCallBackend(CeedVectorDestroy(&vec));
258   }
259 
260   // Restore context data
261   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
262 
263   // Cleanup
264   CeedCallBackend(CeedDestroy(&ceed));
265   CeedCallBackend(CeedQFunctionDestroy(&qf));
266   if (!(*is_run_good)) data->use_fallback = true;
267   return CEED_ERROR_SUCCESS;
268 }
269 
270 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
271   bool              is_run_good = false;
272   const CeedScalar *input_arr   = NULL;
273   CeedScalar       *output_arr  = NULL;
274 
275   // Try to run kernel
276   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
277   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
278   CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, NULL, input_arr, output_arr, &is_run_good, request));
279   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
280   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
281 
282   // Fallback on unsuccessful run
283   if (!is_run_good) {
284     CeedOperator op_fallback;
285 
286     CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
287     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
288     CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
289   }
290   return CEED_ERROR_SUCCESS;
291 }
292 
293 static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
294   bool              is_run_good[CEED_COMPOSITE_MAX] = {false};
295   CeedInt           num_suboperators;
296   const CeedScalar *input_arr  = NULL;
297   CeedScalar       *output_arr = NULL;
298   Ceed              ceed;
299   CeedOperator     *sub_operators;
300 
301   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
302   CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
303   CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
304   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
305   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
306   for (CeedInt i = 0; i < num_suboperators; i++) {
307     CeedInt num_elem = 0;
308 
309     CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
310     if (num_elem > 0) {
311       cudaStream_t stream = NULL;
312 
313       CeedCallCuda(ceed, cudaStreamCreate(&stream));
314       CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request));
315       CeedCallCuda(ceed, cudaStreamDestroy(stream));
316     }
317   }
318   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
319   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
320   CeedCallCuda(ceed, cudaDeviceSynchronize());
321 
322   // Fallback on unsuccessful run
323   for (CeedInt i = 0; i < num_suboperators; i++) {
324     if (!is_run_good[i]) {
325       CeedOperator op_fallback;
326 
327       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
328       CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
329       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
330     }
331   }
332   CeedCallBackend(CeedDestroy(&ceed));
333   return CEED_ERROR_SUCCESS;
334 }
335 
336 //------------------------------------------------------------------------------
337 // Create operator
338 //------------------------------------------------------------------------------
339 int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
340   bool                   is_composite;
341   Ceed                   ceed;
342   CeedOperator_Cuda_gen *impl;
343 
344   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
345   CeedCallBackend(CeedCalloc(1, &impl));
346   CeedCallBackend(CeedOperatorSetData(op, impl));
347   CeedCall(CeedOperatorIsComposite(op, &is_composite));
348   if (is_composite) {
349     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Cuda_gen));
350   } else {
351     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen));
352   }
353   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen));
354   CeedCallBackend(CeedDestroy(&ceed));
355   return CEED_ERROR_SUCCESS;
356 }
357 
358 //------------------------------------------------------------------------------
359