1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed/ceed.h> 18 #include <ceed/backend.h> 19 #include <ceed/jit-tools.h> 20 #include <hip/hip_runtime.h> 21 #include <stddef.h> 22 #include "ceed-hip-shared.h" 23 #include "../hip/ceed-hip-common.h" 24 #include "../hip/ceed-hip-compile.h" 25 26 //------------------------------------------------------------------------------ 27 // Compute a block size based on required minimum threads 28 //------------------------------------------------------------------------------ 29 static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) { 30 CeedInt maxSize = 1024; // Max total threads per block 31 CeedInt currentSize = 64; // Start with one group 32 33 while(currentSize < maxSize) { 34 if (currentSize > required) 35 break; 36 else 37 currentSize = currentSize * 2; 38 } 39 return currentSize; 40 } 41 42 //------------------------------------------------------------------------------ 43 // Compute required thread block sizes for basis kernels given P, Q, dim, and 44 // num_comp 45 //------------------------------------------------------------------------------ 46 static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, 47 const CeedInt Q_1d, 48 const CeedInt num_comp, CeedInt *block_sizes) { 49 50 // Note that this will use the same block sizes for all dimensions when compiling, 51 // but as each basis object is defined for a particular dimension, we will never 52 // call any kernels except the ones for the dimension for which we have computed the 53 // block sizes. 54 const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d); 55 switch (dim) { 56 case 1: { 57 // Interp kernels: 58 block_sizes[0] = 256; 59 60 // Grad kernels: 61 block_sizes[1] = 256; 62 63 // Weight kernels: 64 block_sizes[2] = 256; 65 66 } break; 67 case 2: { 68 // Interp kernels: 69 CeedInt required = thread_1d * thread_1d * num_comp; 70 block_sizes[0] = ComputeBlockSizeFromRequirement(required); 71 72 // Grad kernels: currently use same required minimum threads 73 block_sizes[1] = ComputeBlockSizeFromRequirement(required); 74 75 // Weight kernels: 76 required = CeedIntMax(64, Q_1d * Q_1d); 77 block_sizes[2] = ComputeBlockSizeFromRequirement(required); 78 79 } break; 80 case 3: { 81 // Interp kernels: 82 CeedInt required = thread_1d * thread_1d * num_comp; 83 block_sizes[0] = ComputeBlockSizeFromRequirement(required); 84 85 // Grad kernels: currently use same required minimum threads 86 block_sizes[1] = ComputeBlockSizeFromRequirement(required); 87 88 // Weight kernels: 89 required = Q_1d * Q_1d * Q_1d; 90 block_sizes[2] = ComputeBlockSizeFromRequirement(required); 91 } 92 } 93 94 return CEED_ERROR_SUCCESS; 95 } 96 97 //------------------------------------------------------------------------------ 98 // Apply basis 99 //------------------------------------------------------------------------------ 100 int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, 101 CeedTransposeMode t_mode, 102 CeedEvalMode eval_mode, CeedVector u, 103 CeedVector v) { 104 int ierr; 105 Ceed ceed; 106 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 107 Ceed_Hip *ceed_Hip; 108 CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 109 CeedBasis_Hip_shared *data; 110 CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 111 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 112 CeedInt dim, num_comp; 113 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 114 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 115 116 // Read vectors 117 const CeedScalar *d_u; 118 CeedScalar *d_v; 119 if (eval_mode != CEED_EVAL_WEIGHT) { 120 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 121 } 122 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 123 124 // Clear v for transpose mode 125 if (t_mode == CEED_TRANSPOSE) { 126 CeedInt length; 127 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 128 ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChkBackend(ierr); 129 } 130 131 // Apply basis operation 132 switch (eval_mode) { 133 case CEED_EVAL_INTERP: { 134 CeedInt P_1d, Q_1d; 135 CeedInt block_size = data->block_sizes[0]; 136 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 137 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 138 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 139 void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 140 &d_u, &d_v 141 }; 142 if (dim == 1) { 143 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 144 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 145 CeedInt grid = num_elem / elems_per_block + 146 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 147 CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 148 ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, 1, 149 elems_per_block, shared_mem, 150 interp_args); CeedChkBackend(ierr); 151 } else if (dim == 2) { 152 // Check if required threads is small enough to do multiple elems 153 const CeedInt elems_per_block = CeedIntMax(block_size / 154 (thread_1d*thread_1d*num_comp), 1); 155 CeedInt grid = num_elem / elems_per_block + 156 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 157 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 158 CeedScalar); 159 ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, 160 num_comp*elems_per_block, shared_mem, 161 interp_args); CeedChkBackend(ierr); 162 } else if (dim == 3) { 163 CeedInt elems_per_block = 1; 164 CeedInt grid = num_elem / elems_per_block + 165 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 166 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 167 CeedScalar); 168 ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, 169 num_comp*elems_per_block, shared_mem, 170 interp_args); CeedChkBackend(ierr); 171 } 172 } break; 173 case CEED_EVAL_GRAD: { 174 CeedInt P_1d, Q_1d; 175 CeedInt block_size = data->block_sizes[1]; 176 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 177 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 178 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 179 void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 180 &data->d_grad_1d, &d_u, &d_v 181 }; 182 if (dim == 1) { 183 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 184 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 185 CeedInt grid = num_elem / elems_per_block + 186 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 187 CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 188 ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, 1, 189 elems_per_block, shared_mem, grad_args); 190 CeedChkBackend(ierr); 191 } else if (dim == 2) { 192 // Check if required threads is small enough to do multiple elems 193 const CeedInt elems_per_block = CeedIntMax(block_size/ 194 (thread_1d*thread_1d*num_comp), 1); 195 CeedInt grid = num_elem / elems_per_block + 196 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 197 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 198 CeedScalar); 199 ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, 200 num_comp*elems_per_block, shared_mem, 201 grad_args); CeedChkBackend(ierr); 202 } else if (dim == 3) { 203 CeedInt elems_per_block = 1; 204 CeedInt grid = num_elem / elems_per_block + 205 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 206 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 207 CeedScalar); 208 ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, 209 num_comp*elems_per_block, shared_mem, 210 grad_args); CeedChkBackend(ierr); 211 } 212 } break; 213 case CEED_EVAL_WEIGHT: { 214 CeedInt Q_1d; 215 CeedInt block_size = data->block_sizes[2]; 216 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 217 void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 218 if (dim == 1) { 219 const CeedInt opt_elems = block_size / Q_1d; 220 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 221 const CeedInt grid_size = num_elem / elems_per_block + 222 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 223 ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, 224 elems_per_block, 1, weight_args); 225 CeedChkBackend(ierr); 226 } else if (dim == 2) { 227 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 228 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 229 const CeedInt grid_size = num_elem / elems_per_block + 230 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 231 ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, 232 elems_per_block, weight_args); 233 CeedChkBackend(ierr); 234 } else if (dim == 3) { 235 const CeedInt grid_size = num_elem; 236 ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, Q_1d, 237 weight_args); 238 CeedChkBackend(ierr); 239 } 240 } break; 241 // LCOV_EXCL_START 242 // Evaluate the divergence to/from the quadrature points 243 case CEED_EVAL_DIV: 244 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 245 // Evaluate the curl to/from the quadrature points 246 case CEED_EVAL_CURL: 247 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 248 // Take no action, BasisApply should not have been called 249 case CEED_EVAL_NONE: 250 return CeedError(ceed, CEED_ERROR_BACKEND, 251 "CEED_EVAL_NONE does not make sense in this context"); 252 // LCOV_EXCL_STOP 253 } 254 255 // Restore vectors 256 if (eval_mode != CEED_EVAL_WEIGHT) { 257 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 258 } 259 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 260 return CEED_ERROR_SUCCESS; 261 } 262 263 //------------------------------------------------------------------------------ 264 // Destroy basis 265 //------------------------------------------------------------------------------ 266 static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { 267 int ierr; 268 Ceed ceed; 269 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 270 271 CeedBasis_Hip_shared *data; 272 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 273 274 CeedChk_Hip(ceed, hipModuleUnload(data->module)); 275 276 ierr = hipFree(data->d_q_weight_1d); CeedChk_Hip(ceed, ierr); 277 ierr = hipFree(data->d_interp_1d); CeedChk_Hip(ceed, ierr); 278 ierr = hipFree(data->d_grad_1d); CeedChk_Hip(ceed, ierr); 279 ierr = hipFree(data->d_collo_grad_1d); CeedChk_Hip(ceed, ierr); 280 ierr = CeedFree(&data); CeedChkBackend(ierr); 281 282 return CEED_ERROR_SUCCESS; 283 } 284 285 //------------------------------------------------------------------------------ 286 // Create tensor basis 287 //------------------------------------------------------------------------------ 288 int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 289 const CeedScalar *interp_1d, 290 const CeedScalar *grad_1d, 291 const CeedScalar *q_ref1d, 292 const CeedScalar *q_weight_1d, 293 CeedBasis basis) { 294 int ierr; 295 Ceed ceed; 296 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 297 CeedBasis_Hip_shared *data; 298 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 299 300 // Copy basis data to GPU 301 const CeedInt qBytes = Q_1d * sizeof(CeedScalar); 302 ierr = hipMalloc((void **)&data->d_q_weight_1d, qBytes); 303 CeedChk_Hip(ceed, ierr); 304 ierr = hipMemcpy(data->d_q_weight_1d, q_weight_1d, qBytes, 305 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 306 307 const CeedInt iBytes = qBytes * P_1d; 308 ierr = hipMalloc((void **)&data->d_interp_1d, iBytes); CeedChk_Hip(ceed, ierr); 309 ierr = hipMemcpy(data->d_interp_1d, interp_1d, iBytes, 310 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 311 312 ierr = hipMalloc((void **)&data->d_grad_1d, iBytes); CeedChk_Hip(ceed, ierr); 313 ierr = hipMemcpy(data->d_grad_1d, grad_1d, iBytes, 314 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 315 316 // Compute collocated gradient and copy to GPU 317 data->d_collo_grad_1d = NULL; 318 if (dim == 3 && Q_1d >= P_1d) { 319 CeedScalar *collo_grad_1d; 320 ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr); 321 ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr); 322 ierr = hipMalloc((void **)&data->d_collo_grad_1d, qBytes * Q_1d); 323 CeedChk_Hip(ceed, ierr); 324 ierr = hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, qBytes * Q_1d, 325 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 326 ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr); 327 } 328 329 // Set number of threads per block for basis kernels 330 CeedInt num_comp; 331 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 332 ierr = ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, 333 data->block_sizes); 334 CeedChkBackend(ierr); 335 336 // Compile basis kernels 337 char *basis_kernel_path, *basis_kernel_source; 338 ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-shared-basis.h", 339 &basis_kernel_path); CeedChkBackend(ierr); 340 ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 341 CeedChkBackend(ierr); 342 ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 11, 343 "Q1D", Q_1d, 344 "P1D", P_1d, 345 "T1D", CeedIntMax(Q_1d, P_1d), 346 "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 347 Q_1d : P_1d, dim), 348 "BASIS_DIM", dim, 349 "BASIS_NCOMP", num_comp, 350 "BASIS_ELEMSIZE", CeedIntPow(P_1d, dim), 351 "BASIS_NQPT", CeedIntPow(Q_1d, dim), 352 "INTERP_BLKSIZE", data->block_sizes[0], 353 "GRAD_BLKSIZE", data->block_sizes[1], 354 "WEIGHT_BLKSIZE", data->block_sizes[2] 355 ); CeedChkBackend(ierr); 356 ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp); 357 CeedChkBackend(ierr); 358 ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad); 359 CeedChkBackend(ierr); 360 ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight); 361 CeedChkBackend(ierr); 362 ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 363 ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 364 365 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 366 367 // Register backend functions 368 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 369 CeedBasisApplyTensor_Hip_shared); 370 CeedChkBackend(ierr); 371 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 372 CeedBasisDestroy_Hip_shared); CeedChkBackend(ierr); 373 return CEED_ERROR_SUCCESS; 374 } 375 //------------------------------------------------------------------------------ 376