xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator.c (revision fd831f258b694b9857c348ebef72fcfdbf8a8f6b)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-source/cuda/cuda-types.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 #include <stddef.h>
14 
15 #include "../cuda/ceed-cuda-common.h"
16 #include "../cuda/ceed-cuda-compile.h"
17 #include "ceed-cuda-gen-operator-build.h"
18 #include "ceed-cuda-gen.h"
19 
20 //------------------------------------------------------------------------------
21 // Destroy operator
22 //------------------------------------------------------------------------------
23 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
24   Ceed                   ceed;
25   CeedOperator_Cuda_gen *impl;
26 
27   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
28   CeedCallBackend(CeedOperatorGetData(op, &impl));
29   if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem));
30   CeedCallBackend(CeedFree(&impl));
31   CeedCallBackend(CeedDestroy(&ceed));
32   return CEED_ERROR_SUCCESS;
33 }
34 
35 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, int elems_per_block) {
36   int useful_threads_per_block = threads_per_elem * elems_per_block;
37   // round up to nearest multiple of warp_size
38   int block_size    = CeedDivUpInt(useful_threads_per_block, warp_size) * warp_size;
39   int blocks_per_sm = threads_per_sm / block_size;
40   return threads_per_sm - useful_threads_per_block * blocks_per_sm;
41 }
42 
43 // Choose the least wasteful block size constrained by blocks_per_sm of max_threads_per_block.
44 //
45 // The x and y part of block[] contains per-element sizes (specified on input) while the z part is number of elements.
46 //
47 // Problem setting: we'd like to make occupancy high with relatively few inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how
48 // many threads can run.
49 //
50 // Note that full occupancy sometimes can't be achieved by one thread block.
51 // For example, an SM might support 1536 threads in total, but only 1024 within a single thread block.
52 // So cuOccupancyMaxPotentialBlockSize may suggest a block size of 768 so that two blocks can run, versus one block of 1024 will prevent a second
53 // block from running. The cuda-gen kernels are pretty heavy with lots of instruction-level parallelism (ILP) so we'll generally be okay with
54 // relatively low occupancy and smaller thread blocks, but we solve a reasonably general problem here. Empirically, we find that blocks bigger than
55 // about 256 have higher latency and worse load balancing when the number of elements is modest.
56 //
57 // cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of the number of quadrature points (or number of basis functions).
58 // They also have a lot of __syncthreads(), which is another point against excessively large thread blocks.
59 // Suppose I have elements with 7x7x7 quadrature points.
60 // This will loop over the last dimension, so we have 7*7=49 threads per element.
61 // Suppose we have two elements = 2*49=98 useful threads.
62 // CUDA schedules in units of full warps (32 threads), so 128 CUDA hardware threads are effectively committed to that block.
63 // Now suppose cuOccupancyMaxPotentialBlockSize returned 352.
64 // We can schedule 2 blocks of size 98 (196 useful threads using 256 hardware threads), but not a third block (which would need a total of 384
65 // hardware threads).
66 //
67 // If instead, we had packed 3 elements, we'd have 3*49=147 useful threads occupying 160 slots, and could schedule two blocks.
68 // Alternatively, we could pack a single block of 7 elements (2*49=343 useful threads) into the 354 slots.
69 // The latter has the least "waste", but __syncthreads() over-synchronizes and it might not pay off relative to smaller blocks.
70 static int BlockGridCalculate(CeedInt num_elem, int blocks_per_sm, int max_threads_per_block, int max_threads_z, int warp_size, int block[3],
71                               int *grid) {
72   const int threads_per_sm   = blocks_per_sm * max_threads_per_block;
73   const int threads_per_elem = block[0] * block[1];
74   int       elems_per_block  = 1;
75   int       waste            = Waste(threads_per_sm, warp_size, threads_per_elem, 1);
76 
77   for (int i = 2; i <= CeedIntMin(max_threads_per_block / threads_per_elem, num_elem); i++) {
78     int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i);
79 
80     // We want to minimize waste, but smaller kernels have lower latency and less __syncthreads() overhead so when a larger block size has the same
81     // waste as a smaller one, go ahead and prefer the smaller block.
82     if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) {
83       elems_per_block = i;
84       waste           = i_waste;
85     }
86   }
87   // In low-order elements, threads_per_elem may be sufficiently low to give an elems_per_block greater than allowable for the device, so we must
88   // check before setting the z-dimension size of the block.
89   block[2] = CeedIntMin(elems_per_block, max_threads_z);
90   *grid    = CeedDivUpInt(num_elem, elems_per_block);
91   return CEED_ERROR_SUCCESS;
92 }
93 
94 // callback for cuOccupancyMaxPotentialBlockSize, providing the amount of dynamic shared memory required for a thread block of size threads.
95 static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); }
96 
97 //------------------------------------------------------------------------------
98 // Apply and add to output
99 //------------------------------------------------------------------------------
100 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
101   bool                    is_at_points;
102   Ceed                    ceed;
103   Ceed_Cuda              *cuda_data;
104   CeedInt                 num_elem, num_input_fields, num_output_fields;
105   CeedEvalMode            eval_mode;
106   CeedVector              output_vecs[CEED_FIELD_MAX] = {NULL};
107   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
108   CeedQFunction_Cuda_gen *qf_data;
109   CeedQFunction           qf;
110   CeedOperatorField      *op_input_fields, *op_output_fields;
111   CeedOperator_Cuda_gen  *data;
112 
113   // Check for tensor-product bases
114   {
115     bool has_tensor_bases;
116 
117     CeedCallBackend(CeedOperatorHasTensorBases(op, &has_tensor_bases));
118     // -- Fallback to ref if not all bases are tensor-product
119     if (!has_tensor_bases) {
120       CeedOperator op_fallback;
121 
122       CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to non-tensor bases");
123       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
124       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
125       return CEED_ERROR_SUCCESS;
126     }
127   }
128 
129   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
130   CeedCallBackend(CeedGetData(ceed, &cuda_data));
131   CeedCallBackend(CeedOperatorGetData(op, &data));
132   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
133   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
134   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
135   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
136   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
137 
138   // Creation of the operator
139   CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op));
140 
141   // Input vectors
142   for (CeedInt i = 0; i < num_input_fields; i++) {
143     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
144     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
145       data->fields.inputs[i] = NULL;
146     } else {
147       bool       is_active;
148       CeedVector vec;
149 
150       // Get input vector
151       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
152       is_active = vec == CEED_VECTOR_ACTIVE;
153       if (is_active) vec = input_vec;
154       CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
155       if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
156     }
157   }
158 
159   // Output vectors
160   for (CeedInt i = 0; i < num_output_fields; i++) {
161     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
162     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
163       data->fields.outputs[i] = NULL;
164     } else {
165       bool       is_active;
166       CeedVector vec;
167 
168       // Get output vector
169       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
170       is_active = vec == CEED_VECTOR_ACTIVE;
171       if (is_active) vec = output_vec;
172       output_vecs[i] = vec;
173       // Check for multiple output modes
174       CeedInt index = -1;
175 
176       for (CeedInt j = 0; j < i; j++) {
177         if (vec == output_vecs[j]) {
178           index = j;
179           break;
180         }
181       }
182       if (index == -1) {
183         CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
184       } else {
185         data->fields.outputs[i] = data->fields.outputs[index];
186       }
187       if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
188     }
189   }
190 
191   // Point coordinates, if needed
192   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
193   if (is_at_points) {
194     // Coords
195     CeedVector vec;
196 
197     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
198     CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
199     CeedCallBackend(CeedVectorDestroy(&vec));
200 
201     // Points per elem
202     if (num_elem != data->points.num_elem) {
203       CeedInt            *points_per_elem;
204       const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
205       CeedElemRestriction rstr_points = NULL;
206 
207       data->points.num_elem = num_elem;
208       CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
209       CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
210       for (CeedInt e = 0; e < num_elem; e++) {
211         CeedInt num_points_elem;
212 
213         CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
214         points_per_elem[e] = num_points_elem;
215       }
216       if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
217       CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
218       CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
219       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
220       CeedCallBackend(CeedFree(&points_per_elem));
221     }
222   }
223 
224   // Get context data
225   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
226 
227   // Apply operator
228   void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
229   const CeedInt dim       = data->dim;
230   const CeedInt Q_1d      = data->Q_1d;
231   const CeedInt P_1d      = data->max_P_1d;
232   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
233   int           max_threads_per_block, min_grid_size, grid;
234 
235   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
236   int block[3] = {thread_1d, dim < 2 ? 1 : thread_1d, -1};
237 
238   CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
239                                      cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
240   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
241 
242   CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs));
243 
244   // Restore input arrays
245   for (CeedInt i = 0; i < num_input_fields; i++) {
246     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
247     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
248     } else {
249       bool       is_active;
250       CeedVector vec;
251 
252       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
253       is_active = vec == CEED_VECTOR_ACTIVE;
254       if (is_active) vec = input_vec;
255       CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
256       if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
257     }
258   }
259 
260   // Restore output arrays
261   for (CeedInt i = 0; i < num_output_fields; i++) {
262     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
263     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
264     } else {
265       bool       is_active;
266       CeedVector vec;
267 
268       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
269       is_active = vec == CEED_VECTOR_ACTIVE;
270       if (is_active) vec = output_vec;
271       // Check for multiple output modes
272       CeedInt index = -1;
273 
274       for (CeedInt j = 0; j < i; j++) {
275         if (vec == output_vecs[j]) {
276           index = j;
277           break;
278         }
279       }
280       if (index == -1) {
281         CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
282       }
283       if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
284     }
285   }
286 
287   // Restore point coordinates, if needed
288   if (is_at_points) {
289     CeedVector vec;
290 
291     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
292     CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
293     CeedCallBackend(CeedVectorDestroy(&vec));
294   }
295 
296   // Restore context data
297   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
298   CeedCallBackend(CeedDestroy(&ceed));
299   CeedCallBackend(CeedQFunctionDestroy(&qf));
300   return CEED_ERROR_SUCCESS;
301 }
302 
303 //------------------------------------------------------------------------------
304 // Create operator
305 //------------------------------------------------------------------------------
306 int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
307   Ceed                   ceed;
308   CeedOperator_Cuda_gen *impl;
309 
310   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
311   CeedCallBackend(CeedCalloc(1, &impl));
312   CeedCallBackend(CeedOperatorSetData(op, impl));
313   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen));
314   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen));
315   CeedCallBackend(CeedDestroy(&ceed));
316   return CEED_ERROR_SUCCESS;
317 }
318 
319 //------------------------------------------------------------------------------
320