1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed/ceed.h> 18 #include <ceed/backend.h> 19 #include <ceed/jit-tools.h> 20 #include <hip/hip_runtime.h> 21 #include "ceed-hip-ref.h" 22 #include "../hip/ceed-hip-compile.h" 23 24 //------------------------------------------------------------------------------ 25 // Basis apply - tensor 26 //------------------------------------------------------------------------------ 27 int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, 28 CeedTransposeMode t_mode, 29 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 30 int ierr; 31 Ceed ceed; 32 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 33 Ceed_Hip *ceed_Hip; 34 ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 35 CeedBasis_Hip *data; 36 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 37 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 38 const int max_block_size = 64; 39 40 // Read vectors 41 const CeedScalar *d_u; 42 CeedScalar *d_v; 43 if (eval_mode != CEED_EVAL_WEIGHT) { 44 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 45 } 46 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 47 48 // Clear v for transpose operation 49 if (t_mode == CEED_TRANSPOSE) { 50 CeedInt length; 51 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 52 ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); 53 CeedChk_Hip(ceed, ierr); 54 } 55 56 // Basis action 57 switch (eval_mode) { 58 case CEED_EVAL_INTERP: { 59 void *interp_args[] = {(void *) &num_elem, (void *) &transpose, 60 &data->d_interp_1d, &d_u, &d_v 61 }; 62 CeedInt Q_1d, dim; 63 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 64 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 65 CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 66 67 ierr = CeedRunKernelHip(ceed, data->Interp, num_elem, block_size, interp_args); 68 CeedChkBackend(ierr); 69 } break; 70 case CEED_EVAL_GRAD: { 71 void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 72 &data->d_grad_1d, &d_u, &d_v 73 }; 74 CeedInt block_size = max_block_size; 75 76 ierr = CeedRunKernelHip(ceed, data->Grad, num_elem, block_size, grad_args); 77 CeedChkBackend(ierr); 78 } break; 79 case CEED_EVAL_WEIGHT: { 80 void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 81 const int block_size = 64; 82 int grid_size = num_elem / block_size; 83 if (block_size * grid_size < num_elem) 84 grid_size += 1; 85 86 ierr = CeedRunKernelHip(ceed, data->Weight, grid_size, block_size, 87 weight_args); CeedChkBackend(ierr); 88 } break; 89 // LCOV_EXCL_START 90 // Evaluate the divergence to/from the quadrature points 91 case CEED_EVAL_DIV: 92 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 93 // Evaluate the curl to/from the quadrature points 94 case CEED_EVAL_CURL: 95 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 96 // Take no action, BasisApply should not have been called 97 case CEED_EVAL_NONE: 98 return CeedError(ceed, CEED_ERROR_BACKEND, 99 "CEED_EVAL_NONE does not make sense in this context"); 100 // LCOV_EXCL_STOP 101 } 102 103 // Restore vectors 104 if (eval_mode != CEED_EVAL_WEIGHT) { 105 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 106 } 107 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 108 return CEED_ERROR_SUCCESS; 109 } 110 111 //------------------------------------------------------------------------------ 112 // Basis apply - non-tensor 113 //------------------------------------------------------------------------------ 114 int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, 115 CeedTransposeMode t_mode, CeedEvalMode eval_mode, 116 CeedVector u, CeedVector v) { 117 int ierr; 118 Ceed ceed; 119 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 120 Ceed_Hip *ceed_Hip; 121 ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 122 CeedBasisNonTensor_Hip *data; 123 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 124 CeedInt num_nodes, num_qpts; 125 ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChkBackend(ierr); 126 ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChkBackend(ierr); 127 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 128 int elemsPerBlock = 1; 129 int grid = num_elem/elemsPerBlock+(( 130 num_elem/elemsPerBlock*elemsPerBlock<num_elem)?1:0); 131 132 // Read vectors 133 const CeedScalar *d_u; 134 CeedScalar *d_v; 135 if (eval_mode != CEED_EVAL_WEIGHT) { 136 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 137 } 138 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 139 140 // Clear v for transpose operation 141 if (t_mode == CEED_TRANSPOSE) { 142 CeedInt length; 143 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 144 ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); 145 CeedChk_Hip(ceed, ierr); 146 } 147 148 // Apply basis operation 149 switch (eval_mode) { 150 case CEED_EVAL_INTERP: { 151 void *interp_args[] = {(void *) &num_elem, (void *) &transpose, 152 &data->d_interp, &d_u, &d_v 153 }; 154 if (transpose) { 155 ierr = CeedRunKernelDimHip(ceed, data->Interp, grid, num_nodes, 1, 156 elemsPerBlock, interp_args); CeedChkBackend(ierr); 157 } else { 158 ierr = CeedRunKernelDimHip(ceed, data->Interp, grid, num_qpts, 1, 159 elemsPerBlock, interp_args); CeedChkBackend(ierr); 160 } 161 } break; 162 case CEED_EVAL_GRAD: { 163 void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_grad, 164 &d_u, &d_v 165 }; 166 if (transpose) { 167 ierr = CeedRunKernelDimHip(ceed, data->Grad, grid, num_nodes, 1, 168 elemsPerBlock, grad_args); CeedChkBackend(ierr); 169 } else { 170 ierr = CeedRunKernelDimHip(ceed, data->Grad, grid, num_qpts, 1, 171 elemsPerBlock, grad_args); CeedChkBackend(ierr); 172 } 173 } break; 174 case CEED_EVAL_WEIGHT: { 175 void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight, &d_v}; 176 ierr = CeedRunKernelDimHip(ceed, data->Weight, grid, num_qpts, 1, 177 elemsPerBlock, weight_args); CeedChkBackend(ierr); 178 } break; 179 // LCOV_EXCL_START 180 // Evaluate the divergence to/from the quadrature points 181 case CEED_EVAL_DIV: 182 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 183 // Evaluate the curl to/from the quadrature points 184 case CEED_EVAL_CURL: 185 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 186 // Take no action, BasisApply should not have been called 187 case CEED_EVAL_NONE: 188 return CeedError(ceed, CEED_ERROR_BACKEND, 189 "CEED_EVAL_NONE does not make sense in this context"); 190 // LCOV_EXCL_STOP 191 } 192 193 // Restore vectors 194 if (eval_mode != CEED_EVAL_WEIGHT) { 195 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 196 } 197 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 198 return CEED_ERROR_SUCCESS; 199 } 200 201 //------------------------------------------------------------------------------ 202 // Destroy tensor basis 203 //------------------------------------------------------------------------------ 204 static int CeedBasisDestroy_Hip(CeedBasis basis) { 205 int ierr; 206 Ceed ceed; 207 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 208 209 CeedBasis_Hip *data; 210 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 211 212 CeedChk_Hip(ceed, hipModuleUnload(data->module)); 213 214 ierr = hipFree(data->d_q_weight_1d); CeedChk_Hip(ceed, ierr); 215 ierr = hipFree(data->d_interp_1d); CeedChk_Hip(ceed, ierr); 216 ierr = hipFree(data->d_grad_1d); CeedChk_Hip(ceed, ierr); 217 ierr = CeedFree(&data); CeedChkBackend(ierr); 218 219 return CEED_ERROR_SUCCESS; 220 } 221 222 //------------------------------------------------------------------------------ 223 // Destroy non-tensor basis 224 //------------------------------------------------------------------------------ 225 static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { 226 int ierr; 227 Ceed ceed; 228 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 229 230 CeedBasisNonTensor_Hip *data; 231 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 232 233 CeedChk_Hip(ceed, hipModuleUnload(data->module)); 234 235 ierr = hipFree(data->d_q_weight); CeedChk_Hip(ceed, ierr); 236 ierr = hipFree(data->d_interp); CeedChk_Hip(ceed, ierr); 237 ierr = hipFree(data->d_grad); CeedChk_Hip(ceed, ierr); 238 ierr = CeedFree(&data); CeedChkBackend(ierr); 239 240 return CEED_ERROR_SUCCESS; 241 } 242 243 //------------------------------------------------------------------------------ 244 // Create tensor 245 //------------------------------------------------------------------------------ 246 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 247 const CeedScalar *interp_1d, 248 const CeedScalar *grad_1d, 249 const CeedScalar *qref1d, 250 const CeedScalar *q_weight_1d, 251 CeedBasis basis) { 252 int ierr; 253 Ceed ceed; 254 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 255 CeedBasis_Hip *data; 256 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 257 258 // Copy data to GPU 259 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 260 ierr = hipMalloc((void **)&data->d_q_weight_1d, q_bytes); 261 CeedChk_Hip(ceed, ierr); 262 ierr = hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, 263 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 264 265 const CeedInt interp_bytes = q_bytes * P_1d; 266 ierr = hipMalloc((void **)&data->d_interp_1d, interp_bytes); 267 CeedChk_Hip(ceed, ierr); 268 ierr = hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, 269 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 270 271 ierr = hipMalloc((void **)&data->d_grad_1d, interp_bytes); 272 CeedChk_Hip(ceed, ierr); 273 ierr = hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, 274 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 275 276 // Complie basis kernels 277 CeedInt ncomp; 278 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 279 char *basis_kernel_path, *basis_kernel_source; 280 ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-basis-tensor.h", 281 &basis_kernel_path); CeedChkBackend(ierr); 282 ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 283 CeedChkBackend(ierr); 284 ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 7, 285 "BASIS_Q1D", Q_1d, 286 "BASIS_P1D", P_1d, 287 "BASIS_BUF_LEN", ncomp * CeedIntPow(Q_1d > P_1d ? 288 Q_1d : P_1d, dim), 289 "BASIS_DIM", dim, 290 "BASIS_NCOMP", ncomp, 291 "BASIS_ELEMSIZE", CeedIntPow(P_1d, dim), 292 "BASIS_NQPT", CeedIntPow(Q_1d, dim) 293 ); CeedChkBackend(ierr); 294 ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp); 295 CeedChkBackend(ierr); 296 ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad); 297 CeedChkBackend(ierr); 298 ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight); 299 CeedChkBackend(ierr); 300 ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 301 ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 302 303 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 304 305 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 306 CeedBasisApply_Hip); CeedChkBackend(ierr); 307 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 308 CeedBasisDestroy_Hip); CeedChkBackend(ierr); 309 return CEED_ERROR_SUCCESS; 310 } 311 312 //------------------------------------------------------------------------------ 313 // Create non-tensor 314 //------------------------------------------------------------------------------ 315 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, 316 CeedInt num_qpts, const CeedScalar *interp, 317 const CeedScalar *grad, const CeedScalar *qref, 318 const CeedScalar *q_weight, CeedBasis basis) { 319 int ierr; 320 Ceed ceed; 321 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 322 CeedBasisNonTensor_Hip *data; 323 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 324 325 // Copy basis data to GPU 326 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 327 ierr = hipMalloc((void **)&data->d_q_weight, q_bytes); CeedChk_Hip(ceed, ierr); 328 ierr = hipMemcpy(data->d_q_weight, q_weight, q_bytes, 329 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 330 331 const CeedInt interp_bytes = q_bytes * num_nodes; 332 ierr = hipMalloc((void **)&data->d_interp, interp_bytes); 333 CeedChk_Hip(ceed, ierr); 334 ierr = hipMemcpy(data->d_interp, interp, interp_bytes, 335 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 336 337 const CeedInt grad_bytes = q_bytes * num_nodes * dim; 338 ierr = hipMalloc((void **)&data->d_grad, grad_bytes); CeedChk_Hip(ceed, ierr); 339 ierr = hipMemcpy(data->d_grad, grad, grad_bytes, 340 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 341 342 // Compile basis kernels 343 CeedInt ncomp; 344 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 345 char *basis_kernel_path, *basis_kernel_source; 346 ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-basis-nontensor.h", 347 &basis_kernel_path); CeedChkBackend(ierr); 348 ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 349 CeedChkBackend(ierr); 350 ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 4, 351 "Q", num_qpts, 352 "P", num_nodes, 353 "BASIS_DIM", dim, 354 "BASIS_NCOMP", ncomp 355 ); CeedChk_Hip(ceed, ierr); 356 ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp); 357 CeedChk_Hip(ceed, ierr); 358 ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad); 359 CeedChk_Hip(ceed, ierr); 360 ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight); 361 CeedChk_Hip(ceed, ierr); 362 ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 363 ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 364 365 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 366 367 // Register backend functions 368 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 369 CeedBasisApplyNonTensor_Hip); CeedChkBackend(ierr); 370 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 371 CeedBasisDestroyNonTensor_Hip); CeedChkBackend(ierr); 372 return CEED_ERROR_SUCCESS; 373 } 374 //------------------------------------------------------------------------------ 375