xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator.c (revision f1c40530bc6f02bc6ca847af718b8b41cc3fd3ca)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include <stddef.h>
11 #include "ceed-cuda-gen.h"
12 #include "ceed-cuda-gen-operator-build.h"
13 #include "../cuda/ceed-cuda-compile.h"
14 
15 //------------------------------------------------------------------------------
16 // Destroy operator
17 //------------------------------------------------------------------------------
18 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
19   int ierr;
20   CeedOperator_Cuda_gen *impl;
21   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
22   ierr = CeedFree(&impl); CeedChkBackend(ierr);
23   return CEED_ERROR_SUCCESS;
24 }
25 
26 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem,
27                  int elems_per_block) {
28   int useful_threads_per_block = threads_per_elem * elems_per_block;
29   // round up to nearest multiple of warp_size
30   int block_size = ((useful_threads_per_block + warp_size - 1) / warp_size) *
31                    warp_size;
32   int blocks_per_sm = threads_per_sm / block_size;
33   return threads_per_sm - useful_threads_per_block * blocks_per_sm;
34 }
35 
36 // Choose the least wasteful block size constrained by blocks_per_sm of
37 // max_threads_per_block.
38 //
39 // The x and y part of block[] contains per-element sizes (specified on input)
40 // while the z part is number of elements.
41 //
42 // Problem setting: we'd like to make occupancy high with relatively few
43 // inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how
44 // many threads can run.
45 //
46 // Note that full occupancy sometimes can't be achieved by one thread block. For
47 // example, an SM might support 1536 threads in total, but only 1024 within a
48 // single thread block. So cuOccupancyMaxPotentialBlockSize may suggest a block
49 // size of 768 so that two blocks can run, versus one block of 1024 will prevent
50 // a second block from running. The cuda-gen kernels are pretty heavy with lots
51 // of instruction-level parallelism (ILP) so we'll generally be okay with
52 // relatvely low occupancy and smaller thread blocks, but we solve a reasonably
53 // general problem here. Empirically, we find that blocks bigger than about 256
54 // have higher latency and worse load balancing when the number of elements is
55 // modest.
56 //
57 // cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of
58 // the number of quadrature points (or number of basis functions). They also
59 // have a lot of __syncthreads(), which is another point against excessively
60 // large thread blocks. Suppose I have elements with 7x7x7 quadrature points.
61 // This will loop over the last dimension, so we have 7*7=49 threads per
62 // element. Suppose we have two elements = 2*49=98 useful threads. CUDA
63 // schedules in units of full warps (32 threads), so 128 CUDA hardware threads
64 // are effectively committed to that block. Now suppose
65 // cuOccupancyMaxPotentialBlockSize returned 352. We can schedule 2 blocks of
66 // size 98 (196 useful threads using 256 hardware threads), but not a third
67 // block (which would need a total of 384 hardware threads).
68 //
69 // If instead, we had packed 3 elements, we'd have 3*49=147 useful threads
70 // occupying 160 slots, and could schedule two blocks. Alternatively, we could
71 // pack a single block of 7 elements (2*49=343 useful threads) into the 354
72 // slots. The latter has the least "waste", but __syncthreads()
73 // over-synchronizes and it might not pay off relative to smaller blocks.
74 static int BlockGridCalculate(CeedInt num_elem, int blocks_per_sm,
75                               int max_threads_per_block, int max_threads_z,
76                               int warp_size, int block[3], int *grid) {
77   const int threads_per_sm = blocks_per_sm * max_threads_per_block;
78   const int threads_per_elem = block[0] * block[1];
79   int elems_per_block = 1;
80   int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1);
81   for (int i = 2;
82        i <= CeedIntMin(max_threads_per_block / threads_per_elem, num_elem);
83        i++) {
84     int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i);
85     // We want to minimize waste, but smaller kernels have lower latency and
86     // less __syncthreads() overhead so when a larger block size has the same
87     // waste as a smaller one, go ahead and prefer the smaller block.
88     if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) {
89       elems_per_block = i;
90       waste = i_waste;
91     }
92   }
93   // In low-order elements, threads_per_elem may be sufficiently low to give
94   // an elems_per_block greater than allowable for the device, so we must check
95   // before setting the z-dimension size of the block.
96   block[2] = CeedIntMin(elems_per_block, max_threads_z);
97   *grid = (num_elem + elems_per_block - 1) / elems_per_block;
98   return CEED_ERROR_SUCCESS;
99 }
100 
101 // callback for cuOccupancyMaxPotentialBlockSize, providing the amount of
102 // dynamic shared memory required for a thread block of size threads.
103 static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); }
104 
105 //------------------------------------------------------------------------------
106 // Apply and add to output
107 //------------------------------------------------------------------------------
108 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
109     CeedVector output_vec, CeedRequest *request) {
110   int ierr;
111   Ceed ceed;
112   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
113   Ceed_Cuda *cuda_data;
114   ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr);
115   CeedOperator_Cuda_gen *data;
116   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
117   CeedQFunction qf;
118   CeedQFunction_Cuda_gen *qf_data;
119   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
120   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
121   CeedInt num_elem, num_input_fields, num_output_fields;
122   ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr);
123   CeedOperatorField *op_input_fields, *op_output_fields;
124   ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields,
125                                &num_output_fields, &op_output_fields);
126   CeedChkBackend(ierr);
127   CeedQFunctionField *qf_input_fields, *qf_output_fields;
128   ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL,
129                                 &qf_output_fields);
130   CeedChkBackend(ierr);
131   CeedEvalMode eval_mode;
132   CeedVector vec, output_vecs[CEED_FIELD_MAX] = {};
133 
134   // Creation of the operator
135   ierr = CeedCudaGenOperatorBuild(op); CeedChkBackend(ierr);
136 
137   // Input vectors
138   for (CeedInt i = 0; i < num_input_fields; i++) {
139     ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
140     CeedChkBackend(ierr);
141     if (eval_mode == CEED_EVAL_WEIGHT) { // Skip
142       data->fields.inputs[i] = NULL;
143     } else {
144       // Get input vector
145       ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
146       CeedChkBackend(ierr);
147       if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
148       ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]);
149       CeedChkBackend(ierr);
150     }
151   }
152 
153   // Output vectors
154   for (CeedInt i = 0; i < num_output_fields; i++) {
155     ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
156     CeedChkBackend(ierr);
157     if (eval_mode == CEED_EVAL_WEIGHT) { // Skip
158       data->fields.outputs[i] = NULL;
159     } else {
160       // Get output vector
161       ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec);
162       CeedChkBackend(ierr);
163       if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
164       output_vecs[i] = vec;
165       // Check for multiple output modes
166       CeedInt index = -1;
167       for (CeedInt j = 0; j < i; j++) {
168         if (vec == output_vecs[j]) {
169           index = j;
170           break;
171         }
172       }
173       if (index == -1) {
174         ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]);
175         CeedChkBackend(ierr);
176       } else {
177         data->fields.outputs[i] = data->fields.outputs[index];
178       }
179     }
180   }
181 
182   // Get context data
183   ierr = CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c);
184   CeedChkBackend(ierr);
185 
186   // Apply operator
187   void *opargs[] = {(void *) &num_elem, &qf_data->d_c, &data->indices,
188                     &data->fields, &data->B, &data->G, &data->W
189                    };
190   const CeedInt dim = data->dim;
191   const CeedInt Q_1d = data->Q_1d;
192   const CeedInt P_1d = data->max_P_1d;
193   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
194   int max_threads_per_block, min_grid_size;
195   CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size,
196              &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
197   int block[3] = {thread_1d, dim < 2 ? 1 : thread_1d, -1,}, grid;
198   CeedChkBackend(BlockGridCalculate(num_elem,
199                                     min_grid_size/ cuda_data->device_prop.multiProcessorCount,
200                                     max_threads_per_block,
201                                     cuda_data->device_prop.maxThreadsDim[2],
202                                     cuda_data->device_prop.warpSize, block, &grid));
203   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
204   ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1],
205                                     block[2], shared_mem, opargs);
206   CeedChkBackend(ierr);
207 
208   // Restore input arrays
209   for (CeedInt i = 0; i < num_input_fields; i++) {
210     ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
211     CeedChkBackend(ierr);
212     if (eval_mode == CEED_EVAL_WEIGHT) { // Skip
213     } else {
214       ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec);
215       CeedChkBackend(ierr);
216       if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
217       ierr = CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]);
218       CeedChkBackend(ierr);
219     }
220   }
221 
222   // Restore output arrays
223   for (CeedInt i = 0; i < num_output_fields; i++) {
224     ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
225     CeedChkBackend(ierr);
226     if (eval_mode == CEED_EVAL_WEIGHT) { // Skip
227     } else {
228       ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec);
229       CeedChkBackend(ierr);
230       if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
231       // Check for multiple output modes
232       CeedInt index = -1;
233       for (CeedInt j = 0; j < i; j++) {
234         if (vec == output_vecs[j]) {
235           index = j;
236           break;
237         }
238       }
239       if (index == -1) {
240         ierr = CeedVectorRestoreArray(vec, &data->fields.outputs[i]);
241         CeedChkBackend(ierr);
242       }
243     }
244   }
245 
246   // Restore context data
247   ierr = CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c);
248   CeedChkBackend(ierr);
249 
250   return CEED_ERROR_SUCCESS;
251 }
252 
253 //------------------------------------------------------------------------------
254 // Create operator
255 //------------------------------------------------------------------------------
256 int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
257   int ierr;
258   Ceed ceed;
259   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
260   CeedOperator_Cuda_gen *impl;
261 
262   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
263   ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr);
264 
265   ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd",
266                                 CeedOperatorApplyAdd_Cuda_gen); CeedChkBackend(ierr);
267   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
268                                 CeedOperatorDestroy_Cuda_gen); CeedChkBackend(ierr);
269   return CEED_ERROR_SUCCESS;
270 }
271 
272 //------------------------------------------------------------------------------
273