1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
16 17 #include <ceed/ceed.h> 18 #include <ceed/backend.h> 19 #include <stddef.h> 20 #include "ceed-cuda-gen.h" 21 #include "ceed-cuda-gen-operator-build.h" 22 #include "../cuda/ceed-cuda-compile.h" 23 24 //------------------------------------------------------------------------------ 25 // Destroy operator 26 //------------------------------------------------------------------------------ 27 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 28 int ierr; 29 CeedOperator_Cuda_gen *impl; 30 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 31 ierr = CeedFree(&impl); CeedChkBackend(ierr); 32 return CEED_ERROR_SUCCESS; 33 } 34 35 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, 36 int elems_per_block) { 37 int useful_threads_per_block = threads_per_elem * elems_per_block; 38 // round up to nearest multiple of warp_size 39 int block_size = ((useful_threads_per_block + warp_size - 1) / warp_size) * 40 warp_size; 41 int blocks_per_sm = threads_per_sm / block_size; 42 return threads_per_sm - useful_threads_per_block * blocks_per_sm; 43 } 44 45 // Choose the least wasteful block size constrained by blocks_per_sm of 46 // max_threads_per_block. 47 // 48 // The x and y part of block[] contains per-element sizes (specified on input) 49 // while the z part is number of elements. 50 // 51 // Problem setting: we'd like to make occupancy high with relatively few 52 // inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how 53 // many threads can run. 54 // 55 // Note that full occupancy sometimes can't be achieved by one thread block. For 56 // example, an SM might support 1536 threads in total, but only 1024 within a 57 // single thread block. So cuOccupancyMaxPotentialBlockSize may suggest a block 58 // size of 768 so that two blocks can run, versus one block of 1024 will prevent 59 // a second block from running. 
The cuda-gen kernels are pretty heavy with lots 60 // of instruction-level parallelism (ILP) so we'll generally be okay with 61 // relatvely low occupancy and smaller thread blocks, but we solve a reasonably 62 // general problem here. Empirically, we find that blocks bigger than about 256 63 // have higher latency and worse load balancing when the number of elements is 64 // modest. 65 // 66 // cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of 67 // the number of quadrature points (or number of basis functions). They also 68 // have a lot of __syncthreads(), which is another point against excessively 69 // large thread blocks. Suppose I have elements with 7x7x7 quadrature points. 70 // This will loop over the last dimension, so we have 7*7=49 threads per 71 // element. Suppose we have two elements = 2*49=98 useful threads. CUDA 72 // schedules in units of full warps (32 threads), so 128 CUDA hardware threads 73 // are effectively committed to that block. Now suppose 74 // cuOccupancyMaxPotentialBlockSize returned 352. We can schedule 2 blocks of 75 // size 98 (196 useful threads using 256 hardware threads), but not a third 76 // block (which would need a total of 384 hardware threads). 77 // 78 // If instead, we had packed 3 elements, we'd have 3*49=147 useful threads 79 // occupying 160 slots, and could schedule two blocks. Alternatively, we could 80 // pack a single block of 7 elements (2*49=343 useful threads) into the 354 81 // slots. The latter has the least "waste", but __syncthreads() 82 // over-synchronizes and it might not pay off relative to smaller blocks. 
83 static int BlockGridCalculate(CeedInt nelem, int blocks_per_sm, 84 int max_threads_per_block, int max_threads_z, 85 int warp_size, int block[3], int *grid) { 86 const int threads_per_sm = blocks_per_sm * max_threads_per_block; 87 const int threads_per_elem = block[0] * block[1]; 88 int elems_per_block = 1; 89 int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1); 90 for (int i=2; 91 i <= CeedIntMin(max_threads_per_block / threads_per_elem, nelem); 92 i++) { 93 int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i); 94 // We want to minimize waste, but smaller kernels have lower latency and 95 // less __syncthreads() overhead so when a larger block size has the same 96 // waste as a smaller one, go ahead and prefer the smaller block. 97 if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) { 98 elems_per_block = i; 99 waste = i_waste; 100 } 101 } 102 // In low-order elements, threads_per_elem may be sufficiently low to give 103 // an elems_per_block greater than allowable for the device, so we must check 104 // before setting the z-dimension size of the block. 105 block[2] = CeedIntMin(elems_per_block, max_threads_z); 106 *grid = (nelem + elems_per_block - 1) / elems_per_block; 107 return CEED_ERROR_SUCCESS; 108 } 109 110 // callback for cuOccupancyMaxPotentialBlockSize, providing the amount of 111 // dynamic shared memory required for a thread block of size threads. 
// callback for cuOccupancyMaxPotentialBlockSize, providing the amount of
// dynamic shared memory required for a thread block of size threads
// (one CeedScalar slot per thread).
static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); }

//------------------------------------------------------------------------------
// Apply and add to output
//
// Gathers device pointers for all input/output field vectors, (re)builds the
// fused CUDA kernel for this operator if needed, launches it once over all
// elements, then restores every vector array. The `request` argument is not
// used here; NOTE(review): presumably completion is synchronous with the
// kernel-launch helper — confirm against the backend interface contract.
//------------------------------------------------------------------------------
static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec,
    CeedVector outvec, CeedRequest *request) {
  int ierr;
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
  Ceed_Cuda *cuda_data;
  ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr);
  CeedOperator_Cuda_gen *data;
  ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
  CeedQFunction qf;
  CeedQFunction_Cuda_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
  ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
  CeedInt nelem, numinputfields, numoutputfields;
  ierr = CeedOperatorGetNumElements(op, &nelem); CeedChkBackend(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
                               &numoutputfields, &opoutputfields);
  CeedChkBackend(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
  CeedChkBackend(ierr);
  CeedEvalMode emode;
  // outvecs[] remembers which CeedVector backs each output field so that
  // repeated outputs can share one device array (see the aliasing check below)
  CeedVector vec, outvecs[CEED_FIELD_MAX] = {};

  // Creation of the operator (builds/compiles the fused kernel; cached builds
  // are handled inside CeedCudaGenOperatorBuild)
  ierr = CeedCudaGenOperatorBuild(op); CeedChkBackend(ierr);

  // Input vectors: collect read-only device pointers for every non-WEIGHT
  // field; WEIGHT fields carry no vector, so their slot is NULLed
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode == CEED_EVAL_WEIGHT) { // Skip
      data->fields.in[i] = NULL;
    } else {
      // Get input vector (the active placeholder resolves to invec)
      ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
      if (vec == CEED_VECTOR_ACTIVE) vec = invec;
      ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]);
      CeedChkBackend(ierr);
    }
  }

  // Output vectors: same collection as inputs, but writable, and with
  // de-duplication — a vector may appear under several output fields (multiple
  // eval modes) and must only be acquired once
  for (CeedInt i = 0; i < numoutputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode == CEED_EVAL_WEIGHT) { // Skip
      data->fields.out[i] = NULL;
    } else {
      // Get output vector
      ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
      CeedChkBackend(ierr);
      if (vec == CEED_VECTOR_ACTIVE) vec = outvec;
      outvecs[i] = vec;
      // Check for multiple output modes: reuse the array already acquired for
      // an earlier field backed by the same vector
      CeedInt index = -1;
      for (CeedInt j = 0; j < i; j++) {
        if (vec == outvecs[j]) {
          index = j;
          break;
        }
      }
      if (index == -1) {
        ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]);
        CeedChkBackend(ierr);
      } else {
        data->fields.out[i] = data->fields.out[index];
      }
    }
  }

  // Get context data (device pointer for the QFunction user context, if any)
  CeedQFunctionContext ctx;
  ierr = CeedQFunctionGetInnerContext(qf, &ctx); CeedChkBackend(ierr);
  if (ctx) {
    ierr = CeedQFunctionContextGetData(ctx, CEED_MEM_DEVICE, &qf_data->d_c);
    CeedChkBackend(ierr);
  }

  // Apply operator: kernel argument list must match the generated kernel's
  // parameter order (element count, context, indices, fields, bases,
  // gradients, weights)
  void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices,
                    &data->fields, &data->B, &data->G, &data->W
                   };
  const CeedInt dim = data->dim;
  const CeedInt Q1d = data->Q1d;
  const CeedInt P1d = data->maxP1d;
  // Threads in x (and y for dim >= 2) span the larger of quadrature points and
  // basis functions in 1D
  const CeedInt thread1d = CeedIntMax(Q1d, P1d);
  int max_threads_per_block, min_grid_size;
  // Ask the occupancy calculator for the best block size, capped at 0x10000
  // threads; dynamicSMemSize reports the dynamic shared memory per block size
  CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size,
             &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
  // block z (-1 placeholder) and grid are filled in by BlockGridCalculate
  int block[3] = {thread1d, dim < 2 ? 1 : thread1d, -1,}, grid;
  // (return code checked directly rather than via the usual ierr pattern)
  CeedChkBackend(BlockGridCalculate(nelem,
                 min_grid_size / cuda_data->device_prop.multiProcessorCount,
                 max_threads_per_block,
                 cuda_data->device_prop.maxThreadsDim[2],
                 cuda_data->device_prop.warpSize, block, &grid));
  // One CeedScalar of dynamic shared memory per thread in the block, matching
  // dynamicSMemSize above
  CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
  ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1],
                                    block[2], shared_mem, opargs);
  CeedChkBackend(ierr);

  // Restore input arrays
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode == CEED_EVAL_WEIGHT) { // Skip
    } else {
      ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
      if (vec == CEED_VECTOR_ACTIVE) vec = invec;
      ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]);
      CeedChkBackend(ierr);
    }
  }

  // Restore output arrays (only for the first field of each distinct vector,
  // mirroring the de-duplicated acquisition above)
  for (CeedInt i = 0; i < numoutputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode == CEED_EVAL_WEIGHT) { // Skip
    } else {
      ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
      CeedChkBackend(ierr);
      if (vec == CEED_VECTOR_ACTIVE) vec = outvec;
      // Check for multiple output modes
      CeedInt index = -1;
      for (CeedInt j = 0; j < i; j++) {
        if (vec == outvecs[j]) {
          index = j;
          break;
        }
      }
      if (index == -1) {
        ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]);
        CeedChkBackend(ierr);
      }
    }
  }

  // Restore context data
  if (ctx) {
    ierr = CeedQFunctionContextRestoreData(ctx, &qf_data->d_c);
    CeedChkBackend(ierr);
  }
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create operator
//------------------------------------------------------------------------------
267 int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 268 int ierr; 269 Ceed ceed; 270 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 271 CeedOperator_Cuda_gen *impl; 272 273 ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); 274 ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); 275 276 ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", 277 CeedOperatorApplyAdd_Cuda_gen); CeedChkBackend(ierr); 278 ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 279 CeedOperatorDestroy_Cuda_gen); CeedChkBackend(ierr); 280 return CEED_ERROR_SUCCESS; 281 } 282 //------------------------------------------------------------------------------ 283