1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed/ceed.h> 18 #include <ceed/backend.h> 19 #include <stddef.h> 20 #include "ceed-cuda-gen.h" 21 #include "ceed-cuda-gen-operator-build.h" 22 #include "../cuda/ceed-cuda-compile.h" 23 24 //------------------------------------------------------------------------------ 25 // Destroy operator 26 //------------------------------------------------------------------------------ 27 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 28 int ierr; 29 CeedOperator_Cuda_gen *impl; 30 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 31 ierr = CeedFree(&impl); CeedChkBackend(ierr); 32 return CEED_ERROR_SUCCESS; 33 } 34 35 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, 36 int elems_per_block) { 37 int useful_threads_per_block = threads_per_elem * elems_per_block; 38 // round up to nearest multiple of warp_size 39 int block_size = ((useful_threads_per_block + warp_size - 1) / warp_size) * 40 warp_size; 41 int blocks_per_sm = threads_per_sm / block_size; 42 return threads_per_sm - useful_threads_per_block * blocks_per_sm; 43 } 44 45 // Choose the least wasteful block size constrained by blocks_per_sm of 46 // max_threads_per_block. 47 // 48 // The x and y part of block[] contains per-element sizes (specified on input) 49 // while the z part is number of elements. 50 // 51 // Problem setting: we'd like to make occupancy high with relatively few 52 // inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how 53 // many threads can run. 54 // 55 // Note that full occupancy sometimes can't be achieved by one thread block. For 56 // example, an SM might support 1536 threads in total, but only 1024 within a 57 // single thread block. So cuOccupancyMaxPotentialBlockSize may suggest a block 58 // size of 768 so that two blocks can run, versus one block of 1024 will prevent 59 // a second block from running. The cuda-gen kernels are pretty heavy with lots 60 // of instruction-level parallelism (ILP) so we'll generally be okay with 61 // relatvely low occupancy and smaller thread blocks, but we solve a reasonably 62 // general problem here. Empirically, we find that blocks bigger than about 256 63 // have higher latency and worse load balancing when the number of elements is 64 // modest. 65 // 66 // cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of 67 // the number of quadrature points (or number of basis functions). They also 68 // have a lot of __syncthreads(), which is another point against excessively 69 // large thread blocks. Suppose I have elements with 7x7x7 quadrature points. 70 // This will loop over the last dimension, so we have 7*7=49 threads per 71 // element. Suppose we have two elements = 2*49=98 useful threads. CUDA 72 // schedules in units of full warps (32 threads), so 128 CUDA hardware threads 73 // are effectively committed to that block. Now suppose 74 // cuOccupancyMaxPotentialBlockSize returned 352. We can schedule 2 blocks of 75 // size 98 (196 useful threads using 256 hardware threads), but not a third 76 // block (which would need a total of 384 hardware threads). 77 // 78 // If instead, we had packed 3 elements, we'd have 3*49=147 useful threads 79 // occupying 160 slots, and could schedule two blocks. Alternatively, we could 80 // pack a single block of 7 elements (2*49=343 useful threads) into the 354 81 // slots. The latter has the least "waste", but __syncthreads() 82 // over-synchronizes and it might not pay off relative to smaller blocks. 83 static int BlockGridCalculate(CeedInt nelem, int blocks_per_sm, 84 int max_threads_per_block, int max_threads_z, 85 int warp_size, int block[3], int *grid) { 86 const int threads_per_sm = blocks_per_sm * max_threads_per_block; 87 const int threads_per_elem = block[0] * block[1]; 88 int elems_per_block = 1; 89 int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1); 90 for (int i=2; 91 i <= CeedIntMin(max_threads_per_block / threads_per_elem, nelem); 92 i++) { 93 int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i); 94 // We want to minimize waste, but smaller kernels have lower latency and 95 // less __syncthreads() overhead so when a larger block size has the same 96 // waste as a smaller one, go ahead and prefer the smaller block. 97 if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) { 98 elems_per_block = i; 99 waste = i_waste; 100 } 101 } 102 // In low-order elements, threads_per_elem may be sufficiently low to give 103 // an elems_per_block greater than allowable for the device, so we must check 104 // before setting the z-dimension size of the block. 105 block[2] = CeedIntMin(elems_per_block, max_threads_z); 106 *grid = (nelem + elems_per_block - 1) / elems_per_block; 107 return CEED_ERROR_SUCCESS; 108 } 109 110 // callback for cuOccupancyMaxPotentialBlockSize, providing the amount of 111 // dynamic shared memory required for a thread block of size threads. 112 static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); } 113 114 //------------------------------------------------------------------------------ 115 // Apply and add to output 116 //------------------------------------------------------------------------------ 117 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec, 118 CeedVector outvec, CeedRequest *request) { 119 int ierr; 120 Ceed ceed; 121 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 122 Ceed_Cuda *cuda_data; 123 ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr); 124 CeedOperator_Cuda_gen *data; 125 ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 126 CeedQFunction qf; 127 CeedQFunction_Cuda_gen *qf_data; 128 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 129 ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 130 CeedInt nelem, numinputfields, numoutputfields; 131 ierr = CeedOperatorGetNumElements(op, &nelem); CeedChkBackend(ierr); 132 CeedOperatorField *opinputfields, *opoutputfields; 133 ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, 134 &numoutputfields, &opoutputfields); 135 CeedChkBackend(ierr); 136 CeedQFunctionField *qfinputfields, *qfoutputfields; 137 ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 138 CeedChkBackend(ierr); 139 CeedEvalMode emode; 140 CeedVector vec, outvecs[CEED_FIELD_MAX] = {}; 141 142 // Creation of the operator 143 ierr = CeedCudaGenOperatorBuild(op); CeedChkBackend(ierr); 144 145 // Input vectors 146 for (CeedInt i = 0; i < numinputfields; i++) { 147 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 148 CeedChkBackend(ierr); 149 if (emode == CEED_EVAL_WEIGHT) { // Skip 150 data->fields.in[i] = NULL; 151 } else { 152 // Get input vector 153 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 154 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 155 ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]); 156 CeedChkBackend(ierr); 157 } 158 } 159 160 // Output vectors 161 for (CeedInt i = 0; i < numoutputfields; i++) { 162 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 163 CeedChkBackend(ierr); 164 if (emode == CEED_EVAL_WEIGHT) { // Skip 165 data->fields.out[i] = NULL; 166 } else { 167 // Get output vector 168 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); 169 CeedChkBackend(ierr); 170 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 171 outvecs[i] = vec; 172 // Check for multiple output modes 173 CeedInt index = -1; 174 for (CeedInt j = 0; j < i; j++) { 175 if (vec == outvecs[j]) { 176 index = j; 177 break; 178 } 179 } 180 if (index == -1) { 181 ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]); 182 CeedChkBackend(ierr); 183 } else { 184 data->fields.out[i] = data->fields.out[index]; 185 } 186 } 187 } 188 189 // Get context data 190 ierr = CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c); 191 CeedChkBackend(ierr); 192 193 // Apply operator 194 void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices, 195 &data->fields, &data->B, &data->G, &data->W 196 }; 197 const CeedInt dim = data->dim; 198 const CeedInt Q1d = data->Q1d; 199 const CeedInt P1d = data->maxP1d; 200 const CeedInt thread1d = CeedIntMax(Q1d, P1d); 201 int max_threads_per_block, min_grid_size; 202 CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, 203 &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 204 int block[3] = {thread1d, dim < 2 ? 1 : thread1d, -1,}, grid; 205 CeedChkBackend(BlockGridCalculate(nelem, 206 min_grid_size/ cuda_data->device_prop.multiProcessorCount, 207 max_threads_per_block, 208 cuda_data->device_prop.maxThreadsDim[2], 209 cuda_data->device_prop.warpSize, block, &grid)); 210 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 211 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1], 212 block[2], shared_mem, opargs); 213 CeedChkBackend(ierr); 214 215 // Restore input arrays 216 for (CeedInt i = 0; i < numinputfields; i++) { 217 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 218 CeedChkBackend(ierr); 219 if (emode == CEED_EVAL_WEIGHT) { // Skip 220 } else { 221 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 222 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 223 ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]); 224 CeedChkBackend(ierr); 225 } 226 } 227 228 // Restore output arrays 229 for (CeedInt i = 0; i < numoutputfields; i++) { 230 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 231 CeedChkBackend(ierr); 232 if (emode == CEED_EVAL_WEIGHT) { // Skip 233 } else { 234 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); 235 CeedChkBackend(ierr); 236 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 237 // Check for multiple output modes 238 CeedInt index = -1; 239 for (CeedInt j = 0; j < i; j++) { 240 if (vec == outvecs[j]) { 241 index = j; 242 break; 243 } 244 } 245 if (index == -1) { 246 ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]); 247 CeedChkBackend(ierr); 248 } 249 } 250 } 251 252 // Restore context data 253 ierr = CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c); 254 CeedChkBackend(ierr); 255 256 return CEED_ERROR_SUCCESS; 257 } 258 259 //------------------------------------------------------------------------------ 260 // Create operator 261 //------------------------------------------------------------------------------ 262 int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 263 int ierr; 264 Ceed ceed; 265 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 266 CeedOperator_Cuda_gen *impl; 267 268 ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); 269 ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); 270 271 ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", 272 CeedOperatorApplyAdd_Cuda_gen); CeedChkBackend(ierr); 273 ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 274 CeedOperatorDestroy_Cuda_gen); CeedChkBackend(ierr); 275 return CEED_ERROR_SUCCESS; 276 } 277 //------------------------------------------------------------------------------ 278