jit-source/magma/magma-common-tensor.h

// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed
f80f4a74SSebastian Grimberg
/// @file
/// Internal header for MAGMA backend common tensor basis definitions
509d4af6SJeremy L Thompson#pragma once
f80f4a74SSebastian Grimberg
3c1e2affSSebastian Grimberg#include "magma-common-defs.h"
f80f4a74SSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// load U or V of a 1D element into the per-component shared memory buffers sBuffer[comp][] --  for all components
// devptr is assumed to point directly to the element; component comp starts at devptr[comp * compstride]
// threads with tx >= LENGTH do nothing, every other thread copies entry tx of each component
// the caller must sync after this call before other threads read sBuffer
template <typename T, int LENGTH, int NUM_COMP>
static __device__ __inline__ void read_1d(const T *devptr, const int compstride, T *sBuffer[NUM_COMP], const int tx) {
  // nothing to load for threads past the end of the element
  if (tx >= LENGTH) return;

  for (int c = 0; c < NUM_COMP; ++c) {
    const int src = c * compstride + tx;

    sBuffer[c][tx] = devptr[src];
  }
}
f80f4a74SSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// store V of a 1D element from the per-component buffers sBuffer[comp][] to global memory --  for all components
// devptr is assumed to point directly to the element; component comp starts at devptr[comp * compstride]
// threads with tx >= LENGTH do nothing, every other thread stores entry tx of each component (overwriting devptr)
template <typename T, int LENGTH, int NUM_COMP>
static __device__ __inline__ void write_1d(T *sBuffer[NUM_COMP], T *devptr, const int compstride, const int tx) {
  // nothing to store for threads past the end of the element
  if (tx >= LENGTH) return;

  for (int c = 0; c < NUM_COMP; ++c) {
    const int dst = c * compstride + tx;

    devptr[dst] = sBuffer[c][tx];
  }
}
f80f4a74SSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// accumulate V of a 1D element from the per-component buffers sBuffer[comp][] into global memory --  for all components
// devptr is assumed to point directly to the element; component comp starts at devptr[comp * compstride]
// threads with tx >= LENGTH do nothing, every other thread adds entry tx of each component onto devptr
template <typename T, int LENGTH, int NUM_COMP>
static __device__ __inline__ void sum_1d(T *sBuffer[NUM_COMP], T *devptr, const int compstride, const int tx) {
  // nothing to accumulate for threads past the end of the element
  if (tx >= LENGTH) return;

  for (int c = 0; c < NUM_COMP; ++c) {
    const int dst = c * compstride + tx;

    devptr[dst] += sBuffer[c][tx];
  }
}
db2becc9SJeremy L Thompson
////////////////////////////////////////////////////////////////////////////////
// read U of a 2D element into registers rU[][][] --  for all components of a single dim
// dU is assumed to be offset by elem-stride and dim-stride
// register is assumed to be rU[DIM_U][NUM_COMP][rU_SIZE]
// i_DIM selects the rU[i_DIM] slice that receives the data (checked against DIM_U at compile time)
// rU_SIZE can be different from P (e.g. max(P, Q)) but must be at least P (checked at compile time)
// sTmp is a shared memory workspace of size P^2
// calls __syncthreads() internally, so all threads of the block must reach this call
template <typename T, int P, int DIM_U, int NUM_COMP, int rU_SIZE, int i_DIM>
static __device__ __inline__ void read_U_2d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) {
  // enforce the documented register layout so a bad instantiation fails to compile instead of indexing past rU
  static_assert(i_DIM >= 0 && i_DIM < DIM_U, "read_U_2d: i_DIM must be a valid index into rU[DIM_U]");
  static_assert(rU_SIZE >= P, "read_U_2d: rU_SIZE must be at least P");

  // read U as a batch P of (1 x P) vectors
  // vec 0  : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory
  // vec 1  : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory
  // ...
  // vec P-1: [u0, u1, u2, ... u_(P-1)] -- contiguous in memory
  // threads collaboratively read vec0 and then vec1 and so on (thread tx fetches entry tx of each vector)
  // but for the kernel, we want
  // thread 0 to hold all of vec0 in registers, and
  // thread 1 to hold all of vec1 in registers, and so on
  // so we need to transpose through sTmp
  const bool active = tx < P;

  for (int comp = 0; comp < NUM_COMP; comp++) {
    // read from global memory into shared memory
    if (active) {
      for (int vec = 0; vec < P; vec++) {
        const int idx = vec * P + tx;

        sTmp[idx] = dU[comp * compstride + idx];
      }
    }
    // sTmp must be completely filled before any thread reads a whole vector from it
    __syncthreads();

    // read from shared memory into registers, thread tx takes all of vec tx
    if (active) {
      for (int i = 0; i < P; i++) {
        rU[i_DIM][comp][i] = sTmp[tx * P + i];
      }
    }
    // all reads of sTmp must finish before the next component overwrites it
    __syncthreads();
  }
}
f80f4a74SSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// read V of a 2D element into registers rV[][][] --  for all components of a single dim
// dV is assumed to be offset by elem-stride and dim-stride
// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
// i_DIM selects the rV[i_DIM] slice that receives the data (checked against DIM_V at compile time)
// rV_SIZE can be different from Q (e.g. max(P, Q)) but must be at least Q (checked at compile time)
// thread tx (tx < Q) loads entries tx, Q + tx, ..., (Q-1) * Q + tx of each component; other threads do nothing
template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
static __device__ __inline__ void read_V_2d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
  // enforce the documented register layout so a bad instantiation fails to compile instead of indexing past rV
  static_assert(i_DIM >= 0 && i_DIM < DIM_V, "read_V_2d: i_DIM must be a valid index into rV[DIM_V]");
  static_assert(rV_SIZE >= Q, "read_V_2d: rV_SIZE must be at least Q");

  if (tx >= Q) return;

  for (int comp = 0; comp < NUM_COMP; comp++) {
    for (int j = 0; j < Q; j++) {
      rV[i_DIM][comp][j] = dV[comp * compstride + j * Q + tx];
    }
  }
}
f80f4a74SSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// write V of a 2D element from registers rV[][][] to global memory --  for all components of a single dim
// dV is assumed to be offset by elem-stride and dim-stride
// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
// i_DIM selects the rV[i_DIM] slice that is written out to dV (checked against DIM_V at compile time)
// rV_SIZE can be different from Q (e.g. max(P, Q)) but must be at least Q (checked at compile time)
// thread tx (tx < Q) overwrites entries tx, Q + tx, ..., (Q-1) * Q + tx of each component; other threads do nothing
template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
static __device__ __inline__ void write_V_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
  // enforce the documented register layout so a bad instantiation fails to compile instead of indexing past rV
  static_assert(i_DIM >= 0 && i_DIM < DIM_V, "write_V_2d: i_DIM must be a valid index into rV[DIM_V]");
  static_assert(rV_SIZE >= Q, "write_V_2d: rV_SIZE must be at least Q");

  if (tx >= Q) return;

  for (int comp = 0; comp < NUM_COMP; comp++) {
    for (int j = 0; j < Q; j++) {
      dV[comp * compstride + j * Q + tx] = rV[i_DIM][comp][j];
    }
  }
}
f80f4a74SSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// sum V of a 2D element from registers rV[][][] into global memory --  for all components of a single dim
// dV is assumed to be offset by elem-stride and dim-stride
// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
// i_DIM selects the rV[i_DIM] slice that is accumulated into dV (checked against DIM_V at compile time)
// rV_SIZE can be different from Q (e.g. max(P, Q)) but must be at least Q (checked at compile time)
// thread tx (tx < Q) adds onto entries tx, Q + tx, ..., (Q-1) * Q + tx of each component; other threads do nothing
template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
static __device__ __inline__ void sum_V_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
  // enforce the documented register layout so a bad instantiation fails to compile instead of indexing past rV
  static_assert(i_DIM >= 0 && i_DIM < DIM_V, "sum_V_2d: i_DIM must be a valid index into rV[DIM_V]");
  static_assert(rV_SIZE >= Q, "sum_V_2d: rV_SIZE must be at least Q");

  if (tx >= Q) return;

  for (int comp = 0; comp < NUM_COMP; comp++) {
    for (int j = 0; j < Q; j++) {
      dV[comp * compstride + j * Q + tx] += rV[i_DIM][comp][j];
    }
  }
}
db2becc9SJeremy L Thompson
////////////////////////////////////////////////////////////////////////////////
// read U of a 3D element into registers rU[][][] --  for all components of a single dim
// dU is assumed to be offset by elem-stride and dim-stride
// register is assumed to be rU[DIM_U][NUM_COMP][rU_SIZE]
// i_DIM selects the rU[i_DIM] slice that receives the data (checked against DIM_U at compile time)
// rU_SIZE can be different from P (e.g. max(P, Q)) but must be at least P (checked at compile time)
// sTmp is a shared memory workspace of size P^3
// calls __syncthreads() internally, so all threads of the block must reach this call
template <typename T, int P, int DIM_U, int NUM_COMP, int rU_SIZE, int i_DIM>
static __device__ __inline__ void read_U_3d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) {
  // enforce the documented register layout so a bad instantiation fails to compile instead of indexing past rU
  static_assert(i_DIM >= 0 && i_DIM < DIM_U, "read_U_3d: i_DIM must be a valid index into rU[DIM_U]");
  static_assert(rU_SIZE >= P, "read_U_3d: rU_SIZE must be at least P");

  // read U as a batch P^2 of (1 x P) vectors
  // vec 0    : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory
  // vec 1    : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory
  // ...
  // vec P^2-1: [u0, u1, u2, ... u_(P-1)] -- contiguous in memory
  // the P^2 active threads collaboratively copy the P^3 entries in P passes of P^2 contiguous entries
  // but for the kernel, we want
  // thread 0 to hold all of vec0 in registers, and
  // thread 1 to hold all of vec1 in registers, and so on
  // so we need to transpose through sTmp
  const bool active = tx < P * P;

  for (int comp = 0; comp < NUM_COMP; comp++) {
    // read from global memory into shared memory
    if (active) {
      for (int pass = 0; pass < P; pass++) {
        const int idx = pass * P * P + tx;

        sTmp[idx] = dU[comp * compstride + idx];
      }
    }
    // sTmp must be completely filled before any thread reads a whole vector from it
    __syncthreads();

    // read from shared memory into registers, thread tx takes all of vec tx
    if (active) {
      for (int i = 0; i < P; i++) {
        rU[i_DIM][comp][i] = sTmp[tx * P + i];
      }
    }
    // all reads of sTmp must finish before the next component overwrites it
    __syncthreads();
  }
}
f80f4a74SSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// read V of a 3D element into registers rV[][][] --  for all components of a single dim
// dV is assumed to be offset by elem-stride and dim-stride
// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
// i_DIM selects the rV[i_DIM] slice that receives the data (checked against DIM_V at compile time)
// rV_SIZE can be different from Q (e.g. max(P, Q)) but must be at least Q (checked at compile time)
// thread tx (tx < Q^2) loads entries tx, Q^2 + tx, ..., (Q-1) * Q^2 + tx of each component; other threads do nothing
template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
static __device__ __inline__ void read_V_3d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
  // enforce the documented register layout so a bad instantiation fails to compile instead of indexing past rV
  static_assert(i_DIM >= 0 && i_DIM < DIM_V, "read_V_3d: i_DIM must be a valid index into rV[DIM_V]");
  static_assert(rV_SIZE >= Q, "read_V_3d: rV_SIZE must be at least Q");

  if (tx >= Q * Q) return;

  for (int comp = 0; comp < NUM_COMP; comp++) {
    for (int j = 0; j < Q; j++) {
      rV[i_DIM][comp][j] = dV[comp * compstride + j * (Q * Q) + tx];
    }
  }
}
f80f4a74SSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// write V of a 3D element from registers rV[][][] to global memory --  for all components of a single dim
// dV is assumed to point directly to the element (i.e. already offset by elem-stride)
// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
// i_DIM selects the rV[i_DIM] slice that is written out to dV (checked against DIM_V at compile time)
// rV_SIZE can be different from Q (e.g. max(P, Q)) but must be at least Q (checked at compile time)
// thread tx (tx < Q^2) overwrites entries tx, Q^2 + tx, ..., (Q-1) * Q^2 + tx of each component; other threads do nothing
template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
static __device__ __inline__ void write_V_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
  // enforce the documented register layout so a bad instantiation fails to compile instead of indexing past rV
  static_assert(i_DIM >= 0 && i_DIM < DIM_V, "write_V_3d: i_DIM must be a valid index into rV[DIM_V]");
  static_assert(rV_SIZE >= Q, "write_V_3d: rV_SIZE must be at least Q");

  if (tx >= Q * Q) return;

  for (int comp = 0; comp < NUM_COMP; comp++) {
    for (int j = 0; j < Q; j++) {
      dV[comp * compstride + j * (Q * Q) + tx] = rV[i_DIM][comp][j];
    }
  }
}
f80f4a74SSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// sum V of a 3D element from registers rV[][][] into global memory --  for all components of a single dim
// dV is assumed to point directly to the element (i.e. already offset by elem-stride)
// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
// i_DIM selects the rV[i_DIM] slice that is accumulated into dV (checked against DIM_V at compile time)
// rV_SIZE can be different from Q (e.g. max(P, Q)) but must be at least Q (checked at compile time)
// thread tx (tx < Q^2) adds onto entries tx, Q^2 + tx, ..., (Q-1) * Q^2 + tx of each component; other threads do nothing
template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
static __device__ __inline__ void sum_V_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
  // enforce the documented register layout so a bad instantiation fails to compile instead of indexing past rV
  static_assert(i_DIM >= 0 && i_DIM < DIM_V, "sum_V_3d: i_DIM must be a valid index into rV[DIM_V]");
  static_assert(rV_SIZE >= Q, "sum_V_3d: rV_SIZE must be at least Q");

  if (tx >= Q * Q) return;

  for (int comp = 0; comp < NUM_COMP; comp++) {
    for (int j = 0; j < Q; j++) {
      dV[comp * compstride + j * (Q * Q) + tx] += rV[i_DIM][comp][j];
    }
  }
}
db2becc9SJeremy L Thompson
////////////////////////////////////////////////////////////////////////////////
// copies T (no-trans) from dT into shared memory sT, keeping the same layout
// T is B x J
// thread tx (tx < B) copies entries tx, B + tx, ..., (J-1) * B + tx; other threads do nothing
// the caller must sync after this call before other threads read sT
template <int B, int J>
static __device__ __inline__ void read_T_notrans_gm2sm(const int tx, const CeedScalar *dT, CeedScalar *sT) {
  if (tx >= B) return;

  for (int col = 0; col < J; ++col) {
    const int idx = col * B + tx;

    sT[idx] = dT[idx];
  }
}
9e0c01faSSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// copies T (trans) from dT into shared memory sT, transposing on the way
// T is J x B
// thread tx (tx < J) reads dT[i * J + tx] and stores it to sT[tx * B + i] for i = 0 .. B-1; other threads do nothing
// the caller must sync after this call before other threads read sT
template <int B, int J>
static __device__ __inline__ void read_T_trans_gm2sm(const int tx, const CeedScalar *dT, CeedScalar *sT) {
  if (tx >= J) return;

  // destination entries written by this thread are contiguous, starting at sT[tx * B]
  CeedScalar *dst = sT + tx * B;

  for (int k = 0; k < B; ++k) {
    dst[k] = dT[k * J + tx];
  }
}