// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed

/// @file
/// Internal header for HIP non-tensor product basis templates
#ifndef CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H
#define CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H

#include <ceed.h>
//------------------------------------------------------------------------------
// Tensor contraction
//------------------------------------------------------------------------------
template <int NUM_COMP, int Q_COMP, int P, int Q>
inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U,
                                const CeedInt strides_comp_V, const CeedInt strides_q_comp_V, const CeedScalar *__restrict__ d_B,
                                const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  // Apply the P x (Q * Q_COMP) basis matrix d_B for one element: each thread
  // (run with Q threads in x) owns one quadrature point and accumulates all
  // Q_COMP output values for it, one field component at a time.
  const CeedInt thread_q = threadIdx.x;
  // TODO load B in shared memory if blockDim.z > 1?

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Base of this element/component's nodal input values
    const CeedScalar *u_comp = d_U + elem * strides_elem_U + comp * strides_comp_U;
    CeedScalar        acc[Q_COMP] = {0.0};

    for (CeedInt node = 0; node < P; node++) {
      // Load the nodal value once and reuse it across all Q_COMP outputs
      const CeedScalar u_val = u_comp[node];

      for (CeedInt d = 0; d < Q_COMP; d++) acc[d] += d_B[d * P * Q + thread_q * P + node] * u_val;
    }
    // Write the Q_COMP results for this thread's quadrature point
    CeedScalar *v_out = d_V + elem * strides_elem_V + comp * strides_comp_V + thread_q;

    for (CeedInt d = 0; d < Q_COMP; d++) v_out[d * strides_q_comp_V] = acc[d];
  }
}
//------------------------------------------------------------------------------
// Tensor contraction transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int Q_COMP, int P, int Q>
inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U,
                                         const CeedInt strides_comp_V, const CeedInt strides_q_comp_U, const CeedScalar *__restrict__ d_B,
                                         const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  // Apply the transpose of d_B for one element: each thread (run with P
  // threads in x) owns one node and sums contributions from every quadrature
  // point and every Q_COMP slice, one field component at a time.
  const CeedInt thread_p = threadIdx.x;
  // TODO load B in shared memory if blockDim.z > 1?

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar acc = 0.0;

    for (CeedInt d = 0; d < Q_COMP; d++) {
      // Base of this element/component/slice's quadrature-point input values
      const CeedScalar *u_slice = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U;

      for (CeedInt q = 0; q < Q; q++) acc += d_B[d * P * Q + q * P + thread_p] * u_slice[q];
    }
    // Single nodal result for this thread and component
    d_V[elem * strides_elem_V + comp * strides_comp_V + thread_p] = acc;
  }
}
//------------------------------------------------------------------------------

#endif  // CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H