1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed/ceed.h> 18 #include <ceed/backend.h> 19 #include <ceed/jit-tools.h> 20 #include <hip/hip_runtime.h> 21 #include "ceed-hip-ref.h" 22 #include "../hip/ceed-hip-compile.h" 23 24 //------------------------------------------------------------------------------ 25 // Basis apply - tensor 26 //------------------------------------------------------------------------------ 27 int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, 28 CeedTransposeMode t_mode, 29 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 30 int ierr; 31 Ceed ceed; 32 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 33 Ceed_Hip *ceed_Hip; 34 ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 35 CeedBasis_Hip *data; 36 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 37 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 38 const int max_block_size = 64; 39 40 // Read vectors 41 const CeedScalar *d_u; 42 CeedScalar *d_v; 43 if (eval_mode != CEED_EVAL_WEIGHT) { 44 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 45 } 46 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 47 48 // Clear v for transpose operation 49 if (t_mode == CEED_TRANSPOSE) { 50 CeedInt length; 51 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 52 ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); 53 CeedChk_Hip(ceed, ierr); 54 } 55 56 // Basis action 57 switch (eval_mode) { 58 case CEED_EVAL_INTERP: { 59 void *interp_args[] = {(void *) &num_elem, (void *) &transpose, 60 &data->d_interp_1d, &d_u, &d_v 61 }; 62 CeedInt Q_1d, dim; 63 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 64 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 65 CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 66 67 ierr = CeedRunKernelHip(ceed, data->Interp, num_elem, block_size, interp_args); 68 CeedChkBackend(ierr); 69 } break; 70 case CEED_EVAL_GRAD: { 71 void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 72 &data->d_grad_1d, &d_u, &d_v 73 }; 74 CeedInt block_size = max_block_size; 75 76 ierr = CeedRunKernelHip(ceed, data->Grad, num_elem, block_size, grad_args); 77 CeedChkBackend(ierr); 78 } break; 79 case CEED_EVAL_WEIGHT: { 80 void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 81 const int block_size = 64; 82 int grid_size = num_elem / block_size; 83 if (block_size * grid_size < num_elem) 84 grid_size += 1; 85 86 ierr = CeedRunKernelHip(ceed, data->Weight, grid_size, block_size, 87 weight_args); CeedChkBackend(ierr); 88 } break; 89 // LCOV_EXCL_START 90 // Evaluate the divergence to/from the quadrature points 91 case CEED_EVAL_DIV: 92 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 93 // Evaluate the curl to/from the quadrature points 94 case CEED_EVAL_CURL: 95 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 96 // Take no action, BasisApply should not have been called 97 case CEED_EVAL_NONE: 98 return CeedError(ceed, CEED_ERROR_BACKEND, 99 "CEED_EVAL_NONE does not make sense in this context"); 100 // LCOV_EXCL_STOP 101 } 102 103 // Restore vectors 104 if (eval_mode != CEED_EVAL_WEIGHT) { 105 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 106 } 107 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 108 return CEED_ERROR_SUCCESS; 109 } 110 111 //------------------------------------------------------------------------------ 112 // Basis apply - non-tensor 113 //------------------------------------------------------------------------------ 114 int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, 115 CeedTransposeMode t_mode, CeedEvalMode eval_mode, 116 CeedVector u, CeedVector v) { 117 int ierr; 118 Ceed ceed; 119 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 120 Ceed_Hip *ceed_Hip; 121 ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 122 CeedBasisNonTensor_Hip *data; 123 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 124 CeedInt num_nodes, num_qpts; 125 ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChkBackend(ierr); 126 ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChkBackend(ierr); 127 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 128 int elemsPerBlock = 1; 129 int grid = num_elem/elemsPerBlock+(( 130 num_elem/elemsPerBlock*elemsPerBlock<num_elem)?1:0); 131 132 // Read vectors 133 const CeedScalar *d_u; 134 CeedScalar *d_v; 135 if (eval_mode != CEED_EVAL_WEIGHT) { 136 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 137 } 138 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 139 140 // Clear v for transpose operation 141 if (t_mode == CEED_TRANSPOSE) { 142 CeedInt length; 143 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 144 ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); 145 CeedChk_Hip(ceed, ierr); 146 } 147 148 // Apply basis operation 149 switch (eval_mode) { 150 case CEED_EVAL_INTERP: { 151 void *interp_args[] = {(void *) &num_elem, (void *) &transpose, 152 &data->d_interp, &d_u, &d_v 153 }; 154 if (transpose) { 155 ierr = CeedRunKernelDimHip(ceed, data->Interp, grid, num_nodes, 1, 156 elemsPerBlock, interp_args); CeedChkBackend(ierr); 157 } else { 158 ierr = CeedRunKernelDimHip(ceed, data->Interp, grid, num_qpts, 1, 159 elemsPerBlock, interp_args); CeedChkBackend(ierr); 160 } 161 } break; 162 case CEED_EVAL_GRAD: { 163 void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_grad, 164 &d_u, &d_v 165 }; 166 if (transpose) { 167 ierr = CeedRunKernelDimHip(ceed, data->Grad, grid, num_nodes, 1, 168 elemsPerBlock, grad_args); CeedChkBackend(ierr); 169 } else { 170 ierr = CeedRunKernelDimHip(ceed, data->Grad, grid, num_qpts, 1, 171 elemsPerBlock, grad_args); CeedChkBackend(ierr); 172 } 173 } break; 174 case CEED_EVAL_WEIGHT: { 175 void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight, &d_v}; 176 ierr = CeedRunKernelDimHip(ceed, data->Weight, grid, num_qpts, 1, 177 elemsPerBlock, weight_args); CeedChkBackend(ierr); 178 } break; 179 // LCOV_EXCL_START 180 // Evaluate the divergence to/from the quadrature points 181 case CEED_EVAL_DIV: 182 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 183 // Evaluate the curl to/from the quadrature points 184 case CEED_EVAL_CURL: 185 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 186 // Take no action, BasisApply should not have been called 187 case CEED_EVAL_NONE: 188 return CeedError(ceed, CEED_ERROR_BACKEND, 189 "CEED_EVAL_NONE does not make sense in this context"); 190 // LCOV_EXCL_STOP 191 } 192 193 // Restore vectors 194 if (eval_mode != CEED_EVAL_WEIGHT) { 195 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 196 } 197 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 198 return CEED_ERROR_SUCCESS; 199 } 200 201 //------------------------------------------------------------------------------ 202 // Destroy tensor basis 203 //------------------------------------------------------------------------------ 204 static int CeedBasisDestroy_Hip(CeedBasis basis) { 205 int ierr; 206 Ceed ceed; 207 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 208 209 CeedBasis_Hip *data; 210 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 211 212 CeedChk_Hip(ceed, hipModuleUnload(data->module)); 213 214 ierr = hipFree(data->d_q_weight_1d); CeedChk_Hip(ceed, ierr); 215 ierr = hipFree(data->d_interp_1d); CeedChk_Hip(ceed, ierr); 216 ierr = hipFree(data->d_grad_1d); CeedChk_Hip(ceed, ierr); 217 ierr = CeedFree(&data); CeedChkBackend(ierr); 218 219 return CEED_ERROR_SUCCESS; 220 } 221 222 //------------------------------------------------------------------------------ 223 // Destroy non-tensor basis 224 //------------------------------------------------------------------------------ 225 static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { 226 int ierr; 227 Ceed ceed; 228 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 229 230 CeedBasisNonTensor_Hip *data; 231 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 232 233 CeedChk_Hip(ceed, hipModuleUnload(data->module)); 234 235 ierr = hipFree(data->d_q_weight); CeedChk_Hip(ceed, ierr); 236 ierr = hipFree(data->d_interp); CeedChk_Hip(ceed, ierr); 237 ierr = hipFree(data->d_grad); CeedChk_Hip(ceed, ierr); 238 ierr = CeedFree(&data); CeedChkBackend(ierr); 239 240 return CEED_ERROR_SUCCESS; 241 } 242 243 //------------------------------------------------------------------------------ 244 // Create tensor 245 //------------------------------------------------------------------------------ 246 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 247 const CeedScalar *interp_1d, 248 const CeedScalar *grad_1d, 249 const CeedScalar *qref1d, 250 const CeedScalar *q_weight_1d, 251 CeedBasis basis) { 252 int ierr; 253 Ceed ceed; 254 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 255 CeedBasis_Hip *data; 256 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 257 258 // Copy data to GPU 259 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 260 ierr = hipMalloc((void **)&data->d_q_weight_1d, q_bytes); 261 CeedChk_Hip(ceed, ierr); 262 ierr = hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, 263 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 264 265 const CeedInt interp_bytes = q_bytes * P_1d; 266 ierr = hipMalloc((void **)&data->d_interp_1d, interp_bytes); 267 CeedChk_Hip(ceed, ierr); 268 ierr = hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, 269 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 270 271 ierr = hipMalloc((void **)&data->d_grad_1d, interp_bytes); 272 CeedChk_Hip(ceed, ierr); 273 ierr = hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, 274 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 275 276 // Complie basis kernels 277 CeedInt ncomp; 278 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 279 char *basis_kernel_path, *basis_kernel_source; 280 ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-basis-tensor.h", 281 &basis_kernel_path); CeedChkBackend(ierr); 282 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 283 ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 284 CeedChkBackend(ierr); 285 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 286 ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 7, 287 "BASIS_Q_1D", Q_1d, 288 "BASIS_P_1D", P_1d, 289 "BASIS_BUF_LEN", ncomp * CeedIntPow(Q_1d > P_1d ? 290 Q_1d : P_1d, dim), 291 "BASIS_DIM", dim, 292 "BASIS_NUM_COMP", ncomp, 293 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 294 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim) 295 ); CeedChkBackend(ierr); 296 ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp); 297 CeedChkBackend(ierr); 298 ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad); 299 CeedChkBackend(ierr); 300 ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight); 301 CeedChkBackend(ierr); 302 ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 303 ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 304 305 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 306 307 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 308 CeedBasisApply_Hip); CeedChkBackend(ierr); 309 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 310 CeedBasisDestroy_Hip); CeedChkBackend(ierr); 311 return CEED_ERROR_SUCCESS; 312 } 313 314 //------------------------------------------------------------------------------ 315 // Create non-tensor 316 //------------------------------------------------------------------------------ 317 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, 318 CeedInt num_qpts, const CeedScalar *interp, 319 const CeedScalar *grad, const CeedScalar *qref, 320 const CeedScalar *q_weight, CeedBasis basis) { 321 int ierr; 322 Ceed ceed; 323 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 324 CeedBasisNonTensor_Hip *data; 325 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 326 327 // Copy basis data to GPU 328 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 329 ierr = hipMalloc((void **)&data->d_q_weight, q_bytes); CeedChk_Hip(ceed, ierr); 330 ierr = hipMemcpy(data->d_q_weight, q_weight, q_bytes, 331 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 332 333 const CeedInt interp_bytes = q_bytes * num_nodes; 334 ierr = hipMalloc((void **)&data->d_interp, interp_bytes); 335 CeedChk_Hip(ceed, ierr); 336 ierr = hipMemcpy(data->d_interp, interp, interp_bytes, 337 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 338 339 const CeedInt grad_bytes = q_bytes * num_nodes * dim; 340 ierr = hipMalloc((void **)&data->d_grad, grad_bytes); CeedChk_Hip(ceed, ierr); 341 ierr = hipMemcpy(data->d_grad, grad, grad_bytes, 342 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 343 344 // Compile basis kernels 345 CeedInt ncomp; 346 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 347 char *basis_kernel_path, *basis_kernel_source; 348 ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-basis-nontensor.h", 349 &basis_kernel_path); CeedChkBackend(ierr); 350 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 351 ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 352 CeedChkBackend(ierr); 353 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 354 ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 4, 355 "BASIS_Q", num_qpts, 356 "BASIS_P", num_nodes, 357 "BASIS_DIM", dim, 358 "BASIS_NUM_COMP", ncomp 359 ); CeedChk_Hip(ceed, ierr); 360 ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp); 361 CeedChk_Hip(ceed, ierr); 362 ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad); 363 CeedChk_Hip(ceed, ierr); 364 ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight); 365 CeedChk_Hip(ceed, ierr); 366 ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 367 ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 368 369 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 370 371 // Register backend functions 372 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 373 CeedBasisApplyNonTensor_Hip); CeedChkBackend(ierr); 374 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 375 CeedBasisDestroyNonTensor_Hip); CeedChkBackend(ierr); 376 return CEED_ERROR_SUCCESS; 377 } 378 //------------------------------------------------------------------------------ 379