1*3d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2*3d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3241a4b83SYohann // 4*3d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5241a4b83SYohann // 6*3d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7241a4b83SYohann 8ec3da8bcSJed Brown #include <ceed/ceed.h> 9ec3da8bcSJed Brown #include <ceed/backend.h> 103d576824SJeremy L Thompson #include <stddef.h> 11241a4b83SYohann #include "ceed-cuda-gen.h" 12241a4b83SYohann #include "ceed-cuda-gen-operator-build.h" 136d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 14241a4b83SYohann 15ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 16ab213215SJeremy L Thompson // Destroy operator 17ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 18241a4b83SYohann static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 19241a4b83SYohann int ierr; 20241a4b83SYohann CeedOperator_Cuda_gen *impl; 21e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 22e15f9bd0SJeremy L Thompson ierr = CeedFree(&impl); CeedChkBackend(ierr); 23e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 24241a4b83SYohann } 25241a4b83SYohann 2639532cebSJed Brown static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, 2739532cebSJed Brown int elems_per_block) { 2839532cebSJed Brown int useful_threads_per_block = threads_per_elem * elems_per_block; 2939532cebSJed Brown // round up to nearest multiple of warp_size 3039532cebSJed Brown int block_size = ((useful_threads_per_block + warp_size - 1) / warp_size) * 3139532cebSJed Brown warp_size; 3239532cebSJed Brown int blocks_per_sm = threads_per_sm / block_size; 3339532cebSJed Brown return threads_per_sm - useful_threads_per_block * blocks_per_sm; 3439532cebSJed Brown } 3539532cebSJed Brown 3639532cebSJed Brown // Choose the least wasteful block size constrained by blocks_per_sm of 3739532cebSJed Brown // max_threads_per_block. 3839532cebSJed Brown // 3939532cebSJed Brown // The x and y part of block[] contains per-element sizes (specified on input) 4039532cebSJed Brown // while the z part is number of elements. 4139532cebSJed Brown // 4239532cebSJed Brown // Problem setting: we'd like to make occupancy high with relatively few 4339532cebSJed Brown // inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how 4439532cebSJed Brown // many threads can run. 4539532cebSJed Brown // 4639532cebSJed Brown // Note that full occupancy sometimes can't be achieved by one thread block. For 4739532cebSJed Brown // example, an SM might support 1536 threads in total, but only 1024 within a 4839532cebSJed Brown // single thread block. So cuOccupancyMaxPotentialBlockSize may suggest a block 4939532cebSJed Brown // size of 768 so that two blocks can run, versus one block of 1024 will prevent 5039532cebSJed Brown // a second block from running. The cuda-gen kernels are pretty heavy with lots 5139532cebSJed Brown // of instruction-level parallelism (ILP) so we'll generally be okay with 5239532cebSJed Brown // relatvely low occupancy and smaller thread blocks, but we solve a reasonably 5339532cebSJed Brown // general problem here. Empirically, we find that blocks bigger than about 256 5439532cebSJed Brown // have higher latency and worse load balancing when the number of elements is 5539532cebSJed Brown // modest. 5639532cebSJed Brown // 5739532cebSJed Brown // cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of 5839532cebSJed Brown // the number of quadrature points (or number of basis functions). They also 5939532cebSJed Brown // have a lot of __syncthreads(), which is another point against excessively 6039532cebSJed Brown // large thread blocks. Suppose I have elements with 7x7x7 quadrature points. 6139532cebSJed Brown // This will loop over the last dimension, so we have 7*7=49 threads per 6239532cebSJed Brown // element. Suppose we have two elements = 2*49=98 useful threads. CUDA 6339532cebSJed Brown // schedules in units of full warps (32 threads), so 128 CUDA hardware threads 6439532cebSJed Brown // are effectively committed to that block. Now suppose 6539532cebSJed Brown // cuOccupancyMaxPotentialBlockSize returned 352. We can schedule 2 blocks of 6639532cebSJed Brown // size 98 (196 useful threads using 256 hardware threads), but not a third 6739532cebSJed Brown // block (which would need a total of 384 hardware threads). 6839532cebSJed Brown // 6939532cebSJed Brown // If instead, we had packed 3 elements, we'd have 3*49=147 useful threads 7039532cebSJed Brown // occupying 160 slots, and could schedule two blocks. Alternatively, we could 7139532cebSJed Brown // pack a single block of 7 elements (2*49=343 useful threads) into the 354 7239532cebSJed Brown // slots. The latter has the least "waste", but __syncthreads() 7339532cebSJed Brown // over-synchronizes and it might not pay off relative to smaller blocks. 7439532cebSJed Brown static int BlockGridCalculate(CeedInt nelem, int blocks_per_sm, 7513516544Snbeams int max_threads_per_block, int max_threads_z, 7613516544Snbeams int warp_size, int block[3], int *grid) { 7739532cebSJed Brown const int threads_per_sm = blocks_per_sm * max_threads_per_block; 7839532cebSJed Brown const int threads_per_elem = block[0] * block[1]; 7939532cebSJed Brown int elems_per_block = 1; 8039532cebSJed Brown int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1); 8139532cebSJed Brown for (int i=2; 8239532cebSJed Brown i <= CeedIntMin(max_threads_per_block / threads_per_elem, nelem); 8339532cebSJed Brown i++) { 8439532cebSJed Brown int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i); 8539532cebSJed Brown // We want to minimize waste, but smaller kernels have lower latency and 8639532cebSJed Brown // less __syncthreads() overhead so when a larger block size has the same 8739532cebSJed Brown // waste as a smaller one, go ahead and prefer the smaller block. 8839532cebSJed Brown if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) { 8939532cebSJed Brown elems_per_block = i; 9039532cebSJed Brown waste = i_waste; 9139532cebSJed Brown } 9239532cebSJed Brown } 9313516544Snbeams // In low-order elements, threads_per_elem may be sufficiently low to give 9413516544Snbeams // an elems_per_block greater than allowable for the device, so we must check 9513516544Snbeams // before setting the z-dimension size of the block. 9613516544Snbeams block[2] = CeedIntMin(elems_per_block, max_threads_z); 9739532cebSJed Brown *grid = (nelem + elems_per_block - 1) / elems_per_block; 9839532cebSJed Brown return CEED_ERROR_SUCCESS; 9939532cebSJed Brown } 10039532cebSJed Brown 10139532cebSJed Brown // callback for cuOccupancyMaxPotentialBlockSize, providing the amount of 10239532cebSJed Brown // dynamic shared memory required for a thread block of size threads. 10339532cebSJed Brown static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); } 10439532cebSJed Brown 105ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 106ab213215SJeremy L Thompson // Apply and add to output 107ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1083e0c3786SYohann Dudouit static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec, 109241a4b83SYohann CeedVector outvec, CeedRequest *request) { 110241a4b83SYohann int ierr; 111241a4b83SYohann Ceed ceed; 112e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 11339532cebSJed Brown Ceed_Cuda *cuda_data; 11439532cebSJed Brown ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr); 115241a4b83SYohann CeedOperator_Cuda_gen *data; 116e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 117241a4b83SYohann CeedQFunction qf; 118241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 119e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 120e15f9bd0SJeremy L Thompson ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 121241a4b83SYohann CeedInt nelem, numinputfields, numoutputfields; 122e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumElements(op, &nelem); CeedChkBackend(ierr); 123241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 1247e7773b5SJeremy L Thompson ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, 1257e7773b5SJeremy L Thompson &numoutputfields, &opoutputfields); 126e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 127241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 1287e7773b5SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 129e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 130241a4b83SYohann CeedEvalMode emode; 131bf4cb664SJeremy L Thompson CeedVector vec, outvecs[CEED_FIELD_MAX] = {}; 132241a4b83SYohann 133241a4b83SYohann // Creation of the operator 134e15f9bd0SJeremy L Thompson ierr = CeedCudaGenOperatorBuild(op); CeedChkBackend(ierr); 135241a4b83SYohann 136241a4b83SYohann // Input vectors 137241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 138241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 139e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 140241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 141241a4b83SYohann data->fields.in[i] = NULL; 142241a4b83SYohann } else { 143241a4b83SYohann // Get input vector 144e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 145241a4b83SYohann if (vec == CEED_VECTOR_ACTIVE) vec = invec; 146241a4b83SYohann ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]); 147e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 148241a4b83SYohann } 149241a4b83SYohann } 150241a4b83SYohann 151241a4b83SYohann // Output vectors 152241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 153241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 154e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 155241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 156241a4b83SYohann data->fields.out[i] = NULL; 157241a4b83SYohann } else { 158241a4b83SYohann // Get output vector 159e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); 160e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 161241a4b83SYohann if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 1623b2939feSjeremylt outvecs[i] = vec; 1633b2939feSjeremylt // Check for multiple output modes 1643b2939feSjeremylt CeedInt index = -1; 1653b2939feSjeremylt for (CeedInt j = 0; j < i; j++) { 1663b2939feSjeremylt if (vec == outvecs[j]) { 1673b2939feSjeremylt index = j; 1683b2939feSjeremylt break; 1693b2939feSjeremylt } 1703b2939feSjeremylt } 1713b2939feSjeremylt if (index == -1) { 172241a4b83SYohann ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]); 173e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1743b2939feSjeremylt } else { 1753b2939feSjeremylt data->fields.out[i] = data->fields.out[index]; 1763b2939feSjeremylt } 177241a4b83SYohann } 178241a4b83SYohann } 179241a4b83SYohann 180777ff853SJeremy L Thompson // Get context data 181441428dfSJeremy L Thompson ierr = CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c); 182e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 183241a4b83SYohann 184241a4b83SYohann // Apply operator 185288c0443SJeremy L Thompson void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices, 186d80fc06aSjeremylt &data->fields, &data->B, &data->G, &data->W 1877f823360Sjeremylt }; 188241a4b83SYohann const CeedInt dim = data->dim; 189241a4b83SYohann const CeedInt Q1d = data->Q1d; 19018d499f1SYohann const CeedInt P1d = data->maxP1d; 19118d499f1SYohann const CeedInt thread1d = CeedIntMax(Q1d, P1d); 19239532cebSJed Brown int max_threads_per_block, min_grid_size; 19339532cebSJed Brown CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, 19439532cebSJed Brown &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 19539532cebSJed Brown int block[3] = {thread1d, dim < 2 ? 1 : thread1d, -1,}, grid; 19639532cebSJed Brown CeedChkBackend(BlockGridCalculate(nelem, 1970d0321e0SJeremy L Thompson min_grid_size/ cuda_data->device_prop.multiProcessorCount, 1980d0321e0SJeremy L Thompson max_threads_per_block, 1990d0321e0SJeremy L Thompson cuda_data->device_prop.maxThreadsDim[2], 2000d0321e0SJeremy L Thompson cuda_data->device_prop.warpSize, block, &grid)); 20139532cebSJed Brown CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 20239532cebSJed Brown ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1], 20339532cebSJed Brown block[2], shared_mem, opargs); 204e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 205241a4b83SYohann 206241a4b83SYohann // Restore input arrays 207241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 208241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 209e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 210241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 211241a4b83SYohann } else { 212e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 213241a4b83SYohann if (vec == CEED_VECTOR_ACTIVE) vec = invec; 214241a4b83SYohann ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]); 215e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 216241a4b83SYohann } 217241a4b83SYohann } 218241a4b83SYohann 219241a4b83SYohann // Restore output arrays 220241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 221241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 222e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 223241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 224241a4b83SYohann } else { 225e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); 226e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 227241a4b83SYohann if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 2283b2939feSjeremylt // Check for multiple output modes 2293b2939feSjeremylt CeedInt index = -1; 2303b2939feSjeremylt for (CeedInt j = 0; j < i; j++) { 2313b2939feSjeremylt if (vec == outvecs[j]) { 2323b2939feSjeremylt index = j; 2333b2939feSjeremylt break; 2343b2939feSjeremylt } 2353b2939feSjeremylt } 2363b2939feSjeremylt if (index == -1) { 237241a4b83SYohann ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]); 238e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 239241a4b83SYohann } 240241a4b83SYohann } 2413b2939feSjeremylt } 242777ff853SJeremy L Thompson 243777ff853SJeremy L Thompson // Restore context data 244441428dfSJeremy L Thompson ierr = CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c); 245e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 246441428dfSJeremy L Thompson 247e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 248241a4b83SYohann } 249241a4b83SYohann 250ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 251ab213215SJeremy L Thompson // Create operator 252ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 253241a4b83SYohann int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 254241a4b83SYohann int ierr; 255241a4b83SYohann Ceed ceed; 256e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 257241a4b83SYohann CeedOperator_Cuda_gen *impl; 258241a4b83SYohann 259e15f9bd0SJeremy L Thompson ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); 260e15f9bd0SJeremy L Thompson ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); 261241a4b83SYohann 2623e0c3786SYohann Dudouit ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", 263e15f9bd0SJeremy L Thompson CeedOperatorApplyAdd_Cuda_gen); CeedChkBackend(ierr); 264241a4b83SYohann ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 265e15f9bd0SJeremy L Thompson CeedOperatorDestroy_Cuda_gen); CeedChkBackend(ierr); 266e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 267241a4b83SYohann } 268ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 269