1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed/ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <hip/hip_runtime.h> 12 #include <stddef.h> 13 #include "ceed-hip-shared.h" 14 #include "../hip/ceed-hip-common.h" 15 #include "../hip/ceed-hip-compile.h" 16 17 //------------------------------------------------------------------------------ 18 // Compute a block size based on required minimum threads 19 //------------------------------------------------------------------------------ 20 static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) { 21 CeedInt maxSize = 1024; // Max total threads per block 22 CeedInt currentSize = 64; // Start with one group 23 24 while(currentSize < maxSize) { 25 if (currentSize > required) 26 break; 27 else 28 currentSize = currentSize * 2; 29 } 30 return currentSize; 31 } 32 33 //------------------------------------------------------------------------------ 34 // Compute required thread block sizes for basis kernels given P, Q, dim, and 35 // num_comp 36 //------------------------------------------------------------------------------ 37 static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, 38 const CeedInt Q_1d, 39 const CeedInt num_comp, CeedInt *block_sizes) { 40 41 // Note that this will use the same block sizes for all dimensions when compiling, 42 // but as each basis object is defined for a particular dimension, we will never 43 // call any kernels except the ones for the dimension for which we have computed the 44 // block sizes. 45 const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d); 46 switch (dim) { 47 case 1: { 48 // Interp kernels: 49 block_sizes[0] = 256; 50 51 // Grad kernels: 52 block_sizes[1] = 256; 53 54 // Weight kernels: 55 block_sizes[2] = 256; 56 57 } break; 58 case 2: { 59 // Interp kernels: 60 CeedInt required = thread_1d * thread_1d * num_comp; 61 block_sizes[0] = ComputeBlockSizeFromRequirement(required); 62 63 // Grad kernels: currently use same required minimum threads 64 block_sizes[1] = ComputeBlockSizeFromRequirement(required); 65 66 // Weight kernels: 67 required = CeedIntMax(64, Q_1d * Q_1d); 68 block_sizes[2] = ComputeBlockSizeFromRequirement(required); 69 70 } break; 71 case 3: { 72 // Interp kernels: 73 CeedInt required = thread_1d * thread_1d * num_comp; 74 block_sizes[0] = ComputeBlockSizeFromRequirement(required); 75 76 // Grad kernels: currently use same required minimum threads 77 block_sizes[1] = ComputeBlockSizeFromRequirement(required); 78 79 // Weight kernels: 80 required = Q_1d * Q_1d * Q_1d; 81 block_sizes[2] = ComputeBlockSizeFromRequirement(required); 82 } 83 } 84 85 return CEED_ERROR_SUCCESS; 86 } 87 88 //------------------------------------------------------------------------------ 89 // Apply basis 90 //------------------------------------------------------------------------------ 91 int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, 92 CeedTransposeMode t_mode, 93 CeedEvalMode eval_mode, CeedVector u, 94 CeedVector v) { 95 int ierr; 96 Ceed ceed; 97 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 98 Ceed_Hip *ceed_Hip; 99 CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 100 CeedBasis_Hip_shared *data; 101 CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 102 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 103 CeedInt dim, num_comp; 104 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 105 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 106 107 // Read vectors 108 const CeedScalar *d_u; 109 CeedScalar *d_v; 110 if (eval_mode != CEED_EVAL_WEIGHT) { 111 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 112 } 113 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 114 115 // Clear v for transpose mode 116 if (t_mode == CEED_TRANSPOSE) { 117 CeedSize length; 118 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 119 ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChkBackend(ierr); 120 } 121 122 // Apply basis operation 123 switch (eval_mode) { 124 case CEED_EVAL_INTERP: { 125 CeedInt P_1d, Q_1d; 126 CeedInt block_size = data->block_sizes[0]; 127 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 128 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 129 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 130 void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 131 &d_u, &d_v 132 }; 133 if (dim == 1) { 134 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 135 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 136 CeedInt grid = num_elem / elems_per_block + 137 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 138 CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 139 ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, 1, 140 elems_per_block, shared_mem, 141 interp_args); CeedChkBackend(ierr); 142 } else if (dim == 2) { 143 // Check if required threads is small enough to do multiple elems 144 const CeedInt elems_per_block = CeedIntMax(block_size / 145 (thread_1d*thread_1d*num_comp), 1); 146 CeedInt grid = num_elem / elems_per_block + 147 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 148 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 149 CeedScalar); 150 ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, 151 num_comp*elems_per_block, shared_mem, 152 interp_args); CeedChkBackend(ierr); 153 } else if (dim == 3) { 154 CeedInt elems_per_block = 1; 155 CeedInt grid = num_elem / elems_per_block + 156 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 157 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 158 CeedScalar); 159 ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, 160 num_comp*elems_per_block, shared_mem, 161 interp_args); CeedChkBackend(ierr); 162 } 163 } break; 164 case CEED_EVAL_GRAD: { 165 CeedInt P_1d, Q_1d; 166 CeedInt block_size = data->block_sizes[1]; 167 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 168 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 169 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 170 void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 171 &data->d_grad_1d, &d_u, &d_v 172 }; 173 if (dim == 1) { 174 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 175 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 176 CeedInt grid = num_elem / elems_per_block + 177 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 178 CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 179 ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, 1, 180 elems_per_block, shared_mem, grad_args); 181 CeedChkBackend(ierr); 182 } else if (dim == 2) { 183 // Check if required threads is small enough to do multiple elems 184 const CeedInt elems_per_block = CeedIntMax(block_size/ 185 (thread_1d*thread_1d*num_comp), 1); 186 CeedInt grid = num_elem / elems_per_block + 187 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 188 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 189 CeedScalar); 190 ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, 191 num_comp*elems_per_block, shared_mem, 192 grad_args); CeedChkBackend(ierr); 193 } else if (dim == 3) { 194 CeedInt elems_per_block = 1; 195 CeedInt grid = num_elem / elems_per_block + 196 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 197 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 198 CeedScalar); 199 ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, 200 num_comp*elems_per_block, shared_mem, 201 grad_args); CeedChkBackend(ierr); 202 } 203 } break; 204 case CEED_EVAL_WEIGHT: { 205 CeedInt Q_1d; 206 CeedInt block_size = data->block_sizes[2]; 207 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 208 void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 209 if (dim == 1) { 210 const CeedInt opt_elems = block_size / Q_1d; 211 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 212 const CeedInt grid_size = num_elem / elems_per_block + 213 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 214 ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, 215 elems_per_block, 1, weight_args); 216 CeedChkBackend(ierr); 217 } else if (dim == 2) { 218 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 219 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 220 const CeedInt grid_size = num_elem / elems_per_block + 221 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 222 ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, 223 elems_per_block, weight_args); 224 CeedChkBackend(ierr); 225 } else if (dim == 3) { 226 const CeedInt grid_size = num_elem; 227 ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, Q_1d, 228 weight_args); 229 CeedChkBackend(ierr); 230 } 231 } break; 232 // LCOV_EXCL_START 233 // Evaluate the divergence to/from the quadrature points 234 case CEED_EVAL_DIV: 235 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 236 // Evaluate the curl to/from the quadrature points 237 case CEED_EVAL_CURL: 238 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 239 // Take no action, BasisApply should not have been called 240 case CEED_EVAL_NONE: 241 return CeedError(ceed, CEED_ERROR_BACKEND, 242 "CEED_EVAL_NONE does not make sense in this context"); 243 // LCOV_EXCL_STOP 244 } 245 246 // Restore vectors 247 if (eval_mode != CEED_EVAL_WEIGHT) { 248 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 249 } 250 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 251 return CEED_ERROR_SUCCESS; 252 } 253 254 //------------------------------------------------------------------------------ 255 // Destroy basis 256 //------------------------------------------------------------------------------ 257 static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { 258 int ierr; 259 Ceed ceed; 260 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 261 262 CeedBasis_Hip_shared *data; 263 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 264 265 CeedChk_Hip(ceed, hipModuleUnload(data->module)); 266 267 ierr = hipFree(data->d_q_weight_1d); CeedChk_Hip(ceed, ierr); 268 ierr = hipFree(data->d_interp_1d); CeedChk_Hip(ceed, ierr); 269 ierr = hipFree(data->d_grad_1d); CeedChk_Hip(ceed, ierr); 270 ierr = hipFree(data->d_collo_grad_1d); CeedChk_Hip(ceed, ierr); 271 ierr = CeedFree(&data); CeedChkBackend(ierr); 272 273 return CEED_ERROR_SUCCESS; 274 } 275 276 //------------------------------------------------------------------------------ 277 // Create tensor basis 278 //------------------------------------------------------------------------------ 279 int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 280 const CeedScalar *interp_1d, 281 const CeedScalar *grad_1d, 282 const CeedScalar *q_ref1d, 283 const CeedScalar *q_weight_1d, 284 CeedBasis basis) { 285 int ierr; 286 Ceed ceed; 287 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 288 CeedBasis_Hip_shared *data; 289 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 290 291 // Copy basis data to GPU 292 const CeedInt qBytes = Q_1d * sizeof(CeedScalar); 293 ierr = hipMalloc((void **)&data->d_q_weight_1d, qBytes); 294 CeedChk_Hip(ceed, ierr); 295 ierr = hipMemcpy(data->d_q_weight_1d, q_weight_1d, qBytes, 296 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 297 298 const CeedInt iBytes = qBytes * P_1d; 299 ierr = hipMalloc((void **)&data->d_interp_1d, iBytes); CeedChk_Hip(ceed, ierr); 300 ierr = hipMemcpy(data->d_interp_1d, interp_1d, iBytes, 301 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 302 303 ierr = hipMalloc((void **)&data->d_grad_1d, iBytes); CeedChk_Hip(ceed, ierr); 304 ierr = hipMemcpy(data->d_grad_1d, grad_1d, iBytes, 305 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 306 307 // Compute collocated gradient and copy to GPU 308 data->d_collo_grad_1d = NULL; 309 if (dim == 3 && Q_1d >= P_1d) { 310 CeedScalar *collo_grad_1d; 311 ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr); 312 ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr); 313 ierr = hipMalloc((void **)&data->d_collo_grad_1d, qBytes * Q_1d); 314 CeedChk_Hip(ceed, ierr); 315 ierr = hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, qBytes * Q_1d, 316 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 317 ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr); 318 } 319 320 // Set number of threads per block for basis kernels 321 CeedInt num_comp; 322 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 323 ierr = ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, 324 data->block_sizes); 325 CeedChkBackend(ierr); 326 327 // Compile basis kernels 328 char *basis_kernel_path, *basis_kernel_source; 329 ierr = CeedGetInstalledJitPath(ceed, 330 "ceed/jit-source/hip/hip-shared-basis.h", 331 &basis_kernel_path); CeedChkBackend(ierr); 332 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 333 ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 334 CeedChkBackend(ierr); 335 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 336 ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 11, 337 "BASIS_Q_1D", Q_1d, 338 "BASIS_P_1D", P_1d, 339 "BASIS_T_1D", CeedIntMax(Q_1d, P_1d), 340 "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 341 Q_1d : P_1d, dim), 342 "BASIS_DIM", dim, 343 "BASIS_NUM_COMP", num_comp, 344 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 345 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), 346 "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], 347 "BASIS_GRAD_BLOCK_SIZE", data->block_sizes[1], 348 "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2] 349 ); CeedChkBackend(ierr); 350 ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp); 351 CeedChkBackend(ierr); 352 ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad); 353 CeedChkBackend(ierr); 354 ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight); 355 CeedChkBackend(ierr); 356 ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 357 ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 358 359 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 360 361 // Register backend functions 362 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 363 CeedBasisApplyTensor_Hip_shared); 364 CeedChkBackend(ierr); 365 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 366 CeedBasisDestroy_Hip_shared); CeedChkBackend(ierr); 367 return CEED_ERROR_SUCCESS; 368 } 369 //------------------------------------------------------------------------------ 370