xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator.c (revision cefa2673c9b7c6fb97d32b63fbbfb5d237e3c796)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include <stddef.h>
11 #include "ceed-cuda-gen.h"
12 #include "ceed-cuda-gen-operator-build.h"
13 #include "../cuda/ceed-cuda-compile.h"
14 
15 //------------------------------------------------------------------------------
16 // Destroy operator
17 //------------------------------------------------------------------------------
18 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
19   int ierr;
20   CeedOperator_Cuda_gen *impl;
21   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
22   ierr = CeedFree(&impl); CeedChkBackend(ierr);
23   return CEED_ERROR_SUCCESS;
24 }
25 
26 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem,
27                  int elems_per_block) {
28   int useful_threads_per_block = threads_per_elem * elems_per_block;
29   // round up to nearest multiple of warp_size
30   int block_size = ((useful_threads_per_block + warp_size - 1) / warp_size) *
31                    warp_size;
32   int blocks_per_sm = threads_per_sm / block_size;
33   return threads_per_sm - useful_threads_per_block * blocks_per_sm;
34 }
35 
36 // Choose the least wasteful block size constrained by blocks_per_sm of
37 // max_threads_per_block.
38 //
39 // The x and y part of block[] contains per-element sizes (specified on input)
40 // while the z part is number of elements.
41 //
42 // Problem setting: we'd like to make occupancy high with relatively few
43 // inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how
44 // many threads can run.
45 //
46 // Note that full occupancy sometimes can't be achieved by one thread block. For
47 // example, an SM might support 1536 threads in total, but only 1024 within a
48 // single thread block. So cuOccupancyMaxPotentialBlockSize may suggest a block
49 // size of 768 so that two blocks can run, versus one block of 1024 will prevent
50 // a second block from running. The cuda-gen kernels are pretty heavy with lots
51 // of instruction-level parallelism (ILP) so we'll generally be okay with
52 // relatvely low occupancy and smaller thread blocks, but we solve a reasonably
53 // general problem here. Empirically, we find that blocks bigger than about 256
54 // have higher latency and worse load balancing when the number of elements is
55 // modest.
56 //
57 // cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of
58 // the number of quadrature points (or number of basis functions). They also
59 // have a lot of __syncthreads(), which is another point against excessively
60 // large thread blocks. Suppose I have elements with 7x7x7 quadrature points.
61 // This will loop over the last dimension, so we have 7*7=49 threads per
62 // element. Suppose we have two elements = 2*49=98 useful threads. CUDA
63 // schedules in units of full warps (32 threads), so 128 CUDA hardware threads
64 // are effectively committed to that block. Now suppose
65 // cuOccupancyMaxPotentialBlockSize returned 352. We can schedule 2 blocks of
66 // size 98 (196 useful threads using 256 hardware threads), but not a third
67 // block (which would need a total of 384 hardware threads).
68 //
69 // If instead, we had packed 3 elements, we'd have 3*49=147 useful threads
70 // occupying 160 slots, and could schedule two blocks. Alternatively, we could
71 // pack a single block of 7 elements (2*49=343 useful threads) into the 354
72 // slots. The latter has the least "waste", but __syncthreads()
73 // over-synchronizes and it might not pay off relative to smaller blocks.
74 static int BlockGridCalculate(CeedInt nelem, int blocks_per_sm,
75                               int max_threads_per_block, int max_threads_z,
76                               int warp_size, int block[3], int *grid) {
77   const int threads_per_sm = blocks_per_sm * max_threads_per_block;
78   const int threads_per_elem = block[0] * block[1];
79   int elems_per_block = 1;
80   int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1);
81   for (int i=2;
82        i <= CeedIntMin(max_threads_per_block / threads_per_elem, nelem);
83        i++) {
84     int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i);
85     // We want to minimize waste, but smaller kernels have lower latency and
86     // less __syncthreads() overhead so when a larger block size has the same
87     // waste as a smaller one, go ahead and prefer the smaller block.
88     if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) {
89       elems_per_block = i;
90       waste = i_waste;
91     }
92   }
93   // In low-order elements, threads_per_elem may be sufficiently low to give
94   // an elems_per_block greater than allowable for the device, so we must check
95   // before setting the z-dimension size of the block.
96   block[2] = CeedIntMin(elems_per_block, max_threads_z);
97   *grid = (nelem + elems_per_block - 1) / elems_per_block;
98   return CEED_ERROR_SUCCESS;
99 }
100 
101 // callback for cuOccupancyMaxPotentialBlockSize, providing the amount of
102 // dynamic shared memory required for a thread block of size threads.
103 static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); }
104 
105 //------------------------------------------------------------------------------
106 // Apply and add to output
107 //------------------------------------------------------------------------------
108 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec,
109     CeedVector outvec, CeedRequest *request) {
110   int ierr;
111   Ceed ceed;
112   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
113   Ceed_Cuda *cuda_data;
114   ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr);
115   CeedOperator_Cuda_gen *data;
116   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
117   CeedQFunction qf;
118   CeedQFunction_Cuda_gen *qf_data;
119   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
120   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
121   CeedInt nelem, numinputfields, numoutputfields;
122   ierr = CeedOperatorGetNumElements(op, &nelem); CeedChkBackend(ierr);
123   CeedOperatorField *opinputfields, *opoutputfields;
124   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
125                                &numoutputfields, &opoutputfields);
126   CeedChkBackend(ierr);
127   CeedQFunctionField *qfinputfields, *qfoutputfields;
128   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
129   CeedChkBackend(ierr);
130   CeedEvalMode emode;
131   CeedVector vec, outvecs[CEED_FIELD_MAX] = {};
132 
133   // Creation of the operator
134   ierr = CeedCudaGenOperatorBuild(op); CeedChkBackend(ierr);
135 
136   // Input vectors
137   for (CeedInt i = 0; i < numinputfields; i++) {
138     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
139     CeedChkBackend(ierr);
140     if (emode == CEED_EVAL_WEIGHT) { // Skip
141       data->fields.in[i] = NULL;
142     } else {
143       // Get input vector
144       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
145       if (vec == CEED_VECTOR_ACTIVE) vec = invec;
146       ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]);
147       CeedChkBackend(ierr);
148     }
149   }
150 
151   // Output vectors
152   for (CeedInt i = 0; i < numoutputfields; i++) {
153     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
154     CeedChkBackend(ierr);
155     if (emode == CEED_EVAL_WEIGHT) { // Skip
156       data->fields.out[i] = NULL;
157     } else {
158       // Get output vector
159       ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
160       CeedChkBackend(ierr);
161       if (vec == CEED_VECTOR_ACTIVE) vec = outvec;
162       outvecs[i] = vec;
163       // Check for multiple output modes
164       CeedInt index = -1;
165       for (CeedInt j = 0; j < i; j++) {
166         if (vec == outvecs[j]) {
167           index = j;
168           break;
169         }
170       }
171       if (index == -1) {
172         ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]);
173         CeedChkBackend(ierr);
174       } else {
175         data->fields.out[i] = data->fields.out[index];
176       }
177     }
178   }
179 
180   // Get context data
181   ierr = CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c);
182   CeedChkBackend(ierr);
183 
184   // Apply operator
185   void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices,
186                     &data->fields, &data->B, &data->G, &data->W
187                    };
188   const CeedInt dim = data->dim;
189   const CeedInt Q1d = data->Q1d;
190   const CeedInt P1d = data->maxP1d;
191   const CeedInt thread1d = CeedIntMax(Q1d, P1d);
192   int max_threads_per_block, min_grid_size;
193   CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size,
194              &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
195   int block[3] = {thread1d, dim < 2 ? 1 : thread1d, -1,}, grid;
196   CeedChkBackend(BlockGridCalculate(nelem,
197                                     min_grid_size/ cuda_data->device_prop.multiProcessorCount,
198                                     max_threads_per_block,
199                                     cuda_data->device_prop.maxThreadsDim[2],
200                                     cuda_data->device_prop.warpSize, block, &grid));
201   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
202   ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1],
203                                     block[2], shared_mem, opargs);
204   CeedChkBackend(ierr);
205 
206   // Restore input arrays
207   for (CeedInt i = 0; i < numinputfields; i++) {
208     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
209     CeedChkBackend(ierr);
210     if (emode == CEED_EVAL_WEIGHT) { // Skip
211     } else {
212       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
213       if (vec == CEED_VECTOR_ACTIVE) vec = invec;
214       ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]);
215       CeedChkBackend(ierr);
216     }
217   }
218 
219   // Restore output arrays
220   for (CeedInt i = 0; i < numoutputfields; i++) {
221     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
222     CeedChkBackend(ierr);
223     if (emode == CEED_EVAL_WEIGHT) { // Skip
224     } else {
225       ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
226       CeedChkBackend(ierr);
227       if (vec == CEED_VECTOR_ACTIVE) vec = outvec;
228       // Check for multiple output modes
229       CeedInt index = -1;
230       for (CeedInt j = 0; j < i; j++) {
231         if (vec == outvecs[j]) {
232           index = j;
233           break;
234         }
235       }
236       if (index == -1) {
237         ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]);
238         CeedChkBackend(ierr);
239       }
240     }
241   }
242 
243   // Restore context data
244   ierr = CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c);
245   CeedChkBackend(ierr);
246 
247   return CEED_ERROR_SUCCESS;
248 }
249 
250 //------------------------------------------------------------------------------
251 // Create operator
252 //------------------------------------------------------------------------------
253 int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
254   int ierr;
255   Ceed ceed;
256   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
257   CeedOperator_Cuda_gen *impl;
258 
259   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
260   ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr);
261 
262   ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd",
263                                 CeedOperatorApplyAdd_Cuda_gen); CeedChkBackend(ierr);
264   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
265                                 CeedOperatorDestroy_Cuda_gen); CeedChkBackend(ierr);
266   return CEED_ERROR_SUCCESS;
267 }
268 
269 //------------------------------------------------------------------------------
270