xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator.c (revision caee03026e6576cbf7a399c2fc51bb918c77f451)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/backend.h>
9 #include <ceed/ceed.h>
10 #include <stddef.h>
11 
12 #include "../cuda/ceed-cuda-compile.h"
13 #include "ceed-cuda-gen-operator-build.h"
14 #include "ceed-cuda-gen.h"
15 
16 //------------------------------------------------------------------------------
17 // Destroy operator
18 //------------------------------------------------------------------------------
19 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
20   CeedOperator_Cuda_gen *impl;
21   CeedCallBackend(CeedOperatorGetData(op, &impl));
22   CeedCallBackend(CeedFree(&impl));
23   return CEED_ERROR_SUCCESS;
24 }
25 
26 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, int elems_per_block) {
27   int useful_threads_per_block = threads_per_elem * elems_per_block;
28   // round up to nearest multiple of warp_size
29   int block_size    = ((useful_threads_per_block + warp_size - 1) / warp_size) * warp_size;
30   int blocks_per_sm = threads_per_sm / block_size;
31   return threads_per_sm - useful_threads_per_block * blocks_per_sm;
32 }
33 
34 // Choose the least wasteful block size constrained by blocks_per_sm of
35 // max_threads_per_block.
36 //
37 // The x and y part of block[] contains per-element sizes (specified on input)
38 // while the z part is number of elements.
39 //
40 // Problem setting: we'd like to make occupancy high with relatively few
41 // inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how
42 // many threads can run.
43 //
44 // Note that full occupancy sometimes can't be achieved by one thread block. For
45 // example, an SM might support 1536 threads in total, but only 1024 within a
46 // single thread block. So cuOccupancyMaxPotentialBlockSize may suggest a block
47 // size of 768 so that two blocks can run, versus one block of 1024 will prevent
48 // a second block from running. The cuda-gen kernels are pretty heavy with lots
49 // of instruction-level parallelism (ILP) so we'll generally be okay with
50 // relatvely low occupancy and smaller thread blocks, but we solve a reasonably
51 // general problem here. Empirically, we find that blocks bigger than about 256
52 // have higher latency and worse load balancing when the number of elements is
53 // modest.
54 //
55 // cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of
56 // the number of quadrature points (or number of basis functions). They also
57 // have a lot of __syncthreads(), which is another point against excessively
58 // large thread blocks. Suppose I have elements with 7x7x7 quadrature points.
59 // This will loop over the last dimension, so we have 7*7=49 threads per
60 // element. Suppose we have two elements = 2*49=98 useful threads. CUDA
61 // schedules in units of full warps (32 threads), so 128 CUDA hardware threads
62 // are effectively committed to that block. Now suppose
63 // cuOccupancyMaxPotentialBlockSize returned 352. We can schedule 2 blocks of
64 // size 98 (196 useful threads using 256 hardware threads), but not a third
65 // block (which would need a total of 384 hardware threads).
66 //
67 // If instead, we had packed 3 elements, we'd have 3*49=147 useful threads
68 // occupying 160 slots, and could schedule two blocks. Alternatively, we could
69 // pack a single block of 7 elements (2*49=343 useful threads) into the 354
70 // slots. The latter has the least "waste", but __syncthreads()
71 // over-synchronizes and it might not pay off relative to smaller blocks.
72 static int BlockGridCalculate(CeedInt num_elem, int blocks_per_sm, int max_threads_per_block, int max_threads_z, int warp_size, int block[3],
73                               int *grid) {
74   const int threads_per_sm   = blocks_per_sm * max_threads_per_block;
75   const int threads_per_elem = block[0] * block[1];
76   int       elems_per_block  = 1;
77   int       waste            = Waste(threads_per_sm, warp_size, threads_per_elem, 1);
78   for (int i = 2; i <= CeedIntMin(max_threads_per_block / threads_per_elem, num_elem); i++) {
79     int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i);
80     // We want to minimize waste, but smaller kernels have lower latency and
81     // less __syncthreads() overhead so when a larger block size has the same
82     // waste as a smaller one, go ahead and prefer the smaller block.
83     if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) {
84       elems_per_block = i;
85       waste           = i_waste;
86     }
87   }
88   // In low-order elements, threads_per_elem may be sufficiently low to give
89   // an elems_per_block greater than allowable for the device, so we must check
90   // before setting the z-dimension size of the block.
91   block[2] = CeedIntMin(elems_per_block, max_threads_z);
92   *grid    = (num_elem + elems_per_block - 1) / elems_per_block;
93   return CEED_ERROR_SUCCESS;
94 }
95 
96 // callback for cuOccupancyMaxPotentialBlockSize, providing the amount of
97 // dynamic shared memory required for a thread block of size threads.
98 static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); }
99 
100 //------------------------------------------------------------------------------
101 // Apply and add to output
102 //------------------------------------------------------------------------------
103 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
104   Ceed ceed;
105   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
106   Ceed_Cuda *cuda_data;
107   CeedCallBackend(CeedGetData(ceed, &cuda_data));
108   CeedOperator_Cuda_gen *data;
109   CeedCallBackend(CeedOperatorGetData(op, &data));
110   CeedQFunction           qf;
111   CeedQFunction_Cuda_gen *qf_data;
112   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
113   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
114   CeedInt num_elem, num_input_fields, num_output_fields;
115   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
116   CeedOperatorField *op_input_fields, *op_output_fields;
117   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
118   CeedQFunctionField *qf_input_fields, *qf_output_fields;
119   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
120   CeedEvalMode eval_mode;
121   CeedVector   vec, output_vecs[CEED_FIELD_MAX] = {};
122 
123   // Creation of the operator
124   CeedCallBackend(CeedCudaGenOperatorBuild(op));
125 
126   // Input vectors
127   for (CeedInt i = 0; i < num_input_fields; i++) {
128     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
129     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
130       data->fields.inputs[i] = NULL;
131     } else {
132       // Get input vector
133       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
134       if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
135       CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
136     }
137   }
138 
139   // Output vectors
140 
141   for (CeedInt i = 0; i < num_output_fields; i++) {
142     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
143     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
144       data->fields.outputs[i] = NULL;
145     } else {
146       // Get output vector
147       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
148       if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
149       output_vecs[i] = vec;
150       // Check for multiple output modes
151       CeedInt index = -1;
152       for (CeedInt j = 0; j < i; j++) {
153         if (vec == output_vecs[j]) {
154           index = j;
155           break;
156         }
157       }
158       if (index == -1) {
159         CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
160       } else {
161         data->fields.outputs[i] = data->fields.outputs[index];
162       }
163     }
164   }
165 
166   // Get context data
167   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
168 
169   // Apply operator
170 
171   void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W};
172   const CeedInt dim       = data->dim;
173   const CeedInt Q_1d      = data->Q_1d;
174   const CeedInt P_1d      = data->max_P_1d;
175   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
176   int           max_threads_per_block, min_grid_size;
177   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
178   int block[3] =
179       {
180           thread_1d,
181           dim < 2 ? 1 : thread_1d,
182           -1,
183       },
184       grid;
185   CeedChkBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
186                                     cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
187   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
188   CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs));
189 
190   // Restore input arrays
191   for (CeedInt i = 0; i < num_input_fields; i++) {
192     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
193     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
194     } else {
195       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
196       if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
197       CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
198     }
199   }
200 
201   // Restore output arrays
202   for (CeedInt i = 0; i < num_output_fields; i++) {
203     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
204     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
205     } else {
206       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
207       if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
208       // Check for multiple output modes
209       CeedInt index = -1;
210       for (CeedInt j = 0; j < i; j++) {
211         if (vec == output_vecs[j]) {
212           index = j;
213           break;
214         }
215       }
216       if (index == -1) {
217         CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
218       }
219     }
220   }
221 
222   // Restore context data
223   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
224 
225   return CEED_ERROR_SUCCESS;
226 }
227 
228 //------------------------------------------------------------------------------
229 // Create operator
230 //------------------------------------------------------------------------------
231 int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
232   Ceed ceed;
233   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
234   CeedOperator_Cuda_gen *impl;
235 
236   CeedCallBackend(CeedCalloc(1, &impl));
237   CeedCallBackend(CeedOperatorSetData(op, impl));
238 
239   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen));
240   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen));
241   return CEED_ERROR_SUCCESS;
242 }
243 
244 //------------------------------------------------------------------------------
245