jit-source/magma/magma-common-nontensor.h

// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed

/// @file
/// Internal header for MAGMA backend common non-tensor basis definitions
509d4af6SJeremy L Thompson#pragma once
f80f4a74SSebastian Grimberg
3c1e2affSSebastian Grimberg#include "magma-common-defs.h"
f80f4a74SSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// read A (no-trans) from global to reg., staged through the buffer sA
// A is (P x Q)
// 2D thread config. with (P x BY) threads
// on return, thread tx holds rA[j] = sA[j * P + tx] for j = 0, ..., Q - 1
// one __syncthreads() separates the staging copy from the register loads;
// no sync at the end of the function
template <typename T, int P, int Q, int BY>
static __device__ __inline__ void read_A_notrans_g2r_1D_nosync(const int tx, const int ty, const T *dA, T *sA, T rA[Q]) {
  const int num_entries = P * Q;        // total number of entries of A
  const int chunk       = P * BY;       // entries copied per pass (one per thread)
  const int flat_tid    = ty * P + tx;  // flattened thread index
  int       base        = 0;

  // All passes but the last: every thread copies exactly one entry, unguarded
#pragma unroll
  for (; base < num_entries - chunk; base += chunk) {
    sA[base + flat_tid] = dA[base + flat_tid];
  }

  // Last pass: only threads that still map to a valid entry take part
  const int last = base + flat_tid;
  if (last < num_entries) {
    sA[last] = dA[last];
  }

  // The staged copy must be complete before any thread reads it back
  __syncthreads();

  // Thread tx gathers Q entries of sA, starting at offset tx with stride P
  const T *sA_tx = sA + tx;
#pragma unroll
  for (int q = 0; q < Q; q++) {
    rA[q] = sA_tx[q * P];
  }
}

////////////////////////////////////////////////////////////////////////////////
// read A (trans) from global to reg., staged through the buffer sA
// A is (P x Q)
// 2D thread config. with (P x BY) threads
// on return, thread tx holds rA[j] = sA[tx * Q + j] for j = 0, ..., Q - 1
// one __syncthreads() separates the staging copy from the register loads;
// no sync at the end of the function
template <typename T, int P, int Q, int BY>
static __device__ __inline__ void read_A_trans_g2r_1D_nosync(const int tx, const int ty, const T *dA, T *sA, T rA[Q]) {
  const int num_entries = P * Q;        // total number of entries of A
  const int chunk       = P * BY;       // entries copied per pass (one per thread)
  const int flat_tid    = ty * P + tx;  // flattened thread index
  int       base        = 0;

  // All passes but the last: every thread copies exactly one entry, unguarded
#pragma unroll
  for (; base < num_entries - chunk; base += chunk) {
    sA[base + flat_tid] = dA[base + flat_tid];
  }

  // Last pass: only threads that still map to a valid entry take part
  const int last = base + flat_tid;
  if (last < num_entries) {
    sA[last] = dA[last];
  }

  // The staged copy must be complete before any thread reads it back
  __syncthreads();

  // Thread tx reads Q consecutive entries of sA, starting at offset tx * Q
  const T *sA_tx = sA + tx * Q;
#pragma unroll
  for (int q = 0; q < Q; q++) {
    rA[q] = sA_tx[q];
  }
}

////////////////////////////////////////////////////////////////////////////////
// read B from global to shared
// B is (Q x NB); only the first Q * n entries are copied
// 1D thread config. with (P x 1) threads
// n == NB takes a path with a compile-time loop bound so it can be unrolled
// no sync at the end of the function
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void read_B_g2s_1D_nosync(const int tx, const int n, const T *dB, T *sB) {
  int base = 0;

  // All passes but the last: P entries per pass, one per thread, unguarded
  if (n == NB) {
#pragma unroll
    for (; base < Q * NB - P; base += P) {
      sB[base + tx] = dB[base + tx];
    }
  } else {
    for (; base < Q * n - P; base += P) {
      sB[base + tx] = dB[base + tx];
    }
  }

  // Last pass: only threads that still map to one of the Q * n entries copy
  const int last = base + tx;
  if (last < Q * n) {
    sB[last] = dB[last];
  }
}

////////////////////////////////////////////////////////////////////////////////
// write C from reg. to global
// C is (P x NB); only the first n columns are written
// 1D thread config. with (P x 1) threads
// thread tx stores rC[i] to dC[i * P + tx]
// n == NB takes a path with a compile-time loop bound so it can be unrolled
// no sync at the end of the function
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC) {
  T *dC_tx = dC + tx;  // first entry owned by this thread; consecutive entries are P apart

  if (n == NB) {
#pragma unroll
    for (int col = 0; col < NB; col++) {
      dC_tx[col * P] = rC[col];
    }
  } else {
    for (int col = 0; col < n; col++) {
      dC_tx[col * P] = rC[col];
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
// sum into C from reg. to global
// C is (P x NB); only the first n columns are updated
// 1D thread config. with (P x 1) threads
// thread tx adds rC[i] to dC[i * P + tx]
// n == NB takes a path with a compile-time loop bound so it can be unrolled
// no sync at the end of the function
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void sum_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC) {
  T *dC_tx = dC + tx;  // first entry owned by this thread; consecutive entries are P apart

  if (n == NB) {
#pragma unroll
    for (int col = 0; col < NB; col++) {
      dC_tx[col * P] += rC[col];
    }
  } else {
    for (int col = 0; col < n; col++) {
      dC_tx[col * P] += rC[col];
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
// multiply C = A x B using 1D threads in P x 1 config
// A (P x Q)  in reg., one row per thread
// B (Q x NB) in shared memory, column i stored at sB[i * Q .. i * Q + Q - 1]
// C in registers -- one row per thread
// rC is overwritten: rC[i] = sum_j rA[j] * sB[i * Q + j]
// no sync at the end of the function
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void mul_rAsBrC_1D_nosync(T rA[Q], T *sB, T rC[NB]) {
  T rB[Q];  // local copy of the current column of B

#pragma unroll
  for (int col = 0; col < NB; col++) {
    const T *sB_col = sB + col * Q;

    // Pull the whole column of B out of sB before doing any arithmetic
#pragma unroll
    for (int k = 0; k < Q; k++) {
      rB[k] = sB_col[k];
    }

    // Start from zero, then accumulate the dot product of this thread's row with the column
    rC[col] = 0.0;
#pragma unroll
    for (int k = 0; k < Q; k++) {
      rC[col] += rA[k] * rB[k];
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
// multiply C += A x B using 1D threads in P x 1 config
// A (P x Q)  in reg., one row per thread
// B (Q x NB) in shared memory, column i stored at sB[i * Q .. i * Q + Q - 1]
// C in registers -- one row per thread
// rC is accumulated into: rC[i] += sum_j rA[j] * sB[i * Q + j]
// no sync at the end of the function
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void addmul_rAsBrC_1D_nosync(T rA[Q], T *sB, T rC[NB]) {
  T rB[Q];  // local copy of the current column of B

#pragma unroll
  for (int col = 0; col < NB; col++) {
    const T *sB_col = sB + col * Q;

    // Pull the whole column of B out of sB before doing any arithmetic
#pragma unroll
    for (int k = 0; k < Q; k++) {
      rB[k] = sB_col[k];
    }

    // Add the dot product of this thread's row with the column onto the existing value
#pragma unroll
    for (int k = 0; k < Q; k++) {
      rC[col] += rA[k] * rB[k];
    }
  }
}