1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <stddef.h> 15 16 #include "../cuda/ceed-cuda-common.h" 17 #include "../cuda/ceed-cuda-compile.h" 18 #include "ceed-cuda-shared.h" 19 20 //------------------------------------------------------------------------------ 21 // Device initalization 22 //------------------------------------------------------------------------------ 23 int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B); 24 int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr); 25 int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr); 26 27 //------------------------------------------------------------------------------ 28 // Apply basis 29 //------------------------------------------------------------------------------ 30 int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 31 CeedVector v) { 32 Ceed ceed; 33 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 34 Ceed_Cuda *ceed_Cuda; 35 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 36 CeedBasis_Cuda_shared *data; 37 CeedCallBackend(CeedBasisGetData(basis, &data)); 38 CeedInt dim, num_comp; 39 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 40 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 41 42 // Read vectors 43 const CeedScalar *d_u; 44 CeedScalar *d_v; 45 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 46 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 47 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 48 49 // Apply basis operation 50 switch (eval_mode) { 51 case CEED_EVAL_INTERP: { 52 CeedInt P_1d, Q_1d; 53 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 54 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 55 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 56 CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B)); 57 void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v}; 58 if (dim == 1) { 59 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 60 1)); // avoid >512 total threads 61 CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 62 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 63 if (t_mode == CEED_TRANSPOSE) { 64 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); 65 } else { 66 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); 67 } 68 } else if (dim == 2) { 69 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 70 // elems_per_block must be at least 1 71 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 72 CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 73 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 74 if (t_mode == CEED_TRANSPOSE) { 75 CeedCallBackend( 76 CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 77 } else { 78 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 79 } 80 } else if (dim == 3) { 81 CeedInt elems_per_block = 1; 82 CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 83 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 84 if (t_mode == CEED_TRANSPOSE) { 85 CeedCallBackend( 86 CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 87 } else { 88 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 89 } 90 } 91 } break; 92 case CEED_EVAL_GRAD: { 93 CeedInt P_1d, Q_1d; 94 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 95 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 96 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 97 if (data->d_collo_grad_1d) { 98 CeedCallBackend(CeedInit_CudaCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G)); 99 } else { 100 CeedCallBackend(CeedInit_CudaGrad(data->d_interp_1d, data->d_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G)); 101 } 102 void *grad_args[] = {(void *)&num_elem, &data->c_B, &data->c_G, &d_u, &d_v}; 103 if (dim == 1) { 104 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 105 1)); // avoid >512 total threads 106 CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 107 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 108 if (t_mode == CEED_TRANSPOSE) { 109 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); 110 } else { 111 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); 112 } 113 } else if (dim == 2) { 114 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 115 // elems_per_block must be at least 1 116 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 117 CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 118 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 119 if (t_mode == CEED_TRANSPOSE) { 120 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 121 } else { 122 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 123 } 124 } else if (dim == 3) { 125 CeedInt elems_per_block = 1; 126 CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 127 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 128 if (t_mode == CEED_TRANSPOSE) { 129 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 130 } else { 131 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 132 } 133 } 134 } break; 135 case CEED_EVAL_WEIGHT: { 136 CeedInt Q_1d; 137 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 138 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 139 if (dim == 1) { 140 const CeedInt elems_per_block = 32 / Q_1d; 141 const CeedInt gridsize = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 142 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, gridsize, Q_1d, elems_per_block, 1, weight_args)); 143 } else if (dim == 2) { 144 const CeedInt opt_elems = 32 / (Q_1d * Q_1d); 145 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 146 const CeedInt gridsize = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 147 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, elems_per_block, weight_args)); 148 } else if (dim == 3) { 149 const CeedInt opt_elems = 32 / (Q_1d * Q_1d); 150 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 151 const CeedInt gridsize = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 152 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, elems_per_block, weight_args)); 153 } 154 } break; 155 // LCOV_EXCL_START 156 // Evaluate the divergence to/from the quadrature points 157 case CEED_EVAL_DIV: 158 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 159 // Evaluate the curl to/from the quadrature points 160 case CEED_EVAL_CURL: 161 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 162 // Take no action, BasisApply should not have been called 163 case CEED_EVAL_NONE: 164 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 165 // LCOV_EXCL_STOP 166 } 167 168 // Restore vectors 169 if (eval_mode != CEED_EVAL_WEIGHT) { 170 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 171 } 172 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 173 return CEED_ERROR_SUCCESS; 174 } 175 176 //------------------------------------------------------------------------------ 177 // Destroy basis 178 //------------------------------------------------------------------------------ 179 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 180 Ceed ceed; 181 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 182 183 CeedBasis_Cuda_shared *data; 184 CeedCallBackend(CeedBasisGetData(basis, &data)); 185 186 CeedCallCuda(ceed, cuModuleUnload(data->module)); 187 188 CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 189 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 190 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 191 CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); 192 193 CeedCallBackend(CeedFree(&data)); 194 195 return CEED_ERROR_SUCCESS; 196 } 197 198 //------------------------------------------------------------------------------ 199 // Create tensor basis 200 //------------------------------------------------------------------------------ 201 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 202 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 203 Ceed ceed; 204 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 205 CeedBasis_Cuda_shared *data; 206 CeedCallBackend(CeedCalloc(1, &data)); 207 208 // Copy basis data to GPU 209 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 210 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 211 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 212 213 const CeedInt interp_bytes = q_bytes * P_1d; 214 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 215 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 216 217 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 218 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 219 220 // Compute collocated gradient and copy to GPU 221 data->d_collo_grad_1d = NULL; 222 bool has_collocated_grad = dim == 3 && Q_1d >= P_1d; 223 if (has_collocated_grad) { 224 CeedScalar *collo_grad_1d; 225 CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); 226 CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); 227 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); 228 CeedCallCuda(ceed, cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, cudaMemcpyHostToDevice)); 229 CeedCallBackend(CeedFree(&collo_grad_1d)); 230 } 231 232 // Compile basis kernels 233 CeedInt num_comp; 234 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 235 char *basis_kernel_path, *basis_kernel_source; 236 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor.h", &basis_kernel_path)); 237 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 238 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 239 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete -----\n"); 240 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", 241 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 242 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); 243 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 244 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 245 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 246 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 247 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 248 CeedCallBackend(CeedFree(&basis_kernel_path)); 249 CeedCallBackend(CeedFree(&basis_kernel_source)); 250 251 CeedCallBackend(CeedBasisSetData(basis, data)); 252 253 // Register backend functions 254 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); 255 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 256 return CEED_ERROR_SUCCESS; 257 } 258 259 //------------------------------------------------------------------------------ 260