xref: /libCEED/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h (revision 5cd6c1fb67d52eb6a42b887bb79c183682dd86ca)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP non-tensor product basis
10 #ifndef CEED_HIP_REF_BASIS_NONTENSOR_H
11 #define CEED_HIP_REF_BASIS_NONTENSOR_H
12 
13 #include <ceed.h>
14 
15 //------------------------------------------------------------------------------
16 // Non-Tensor Basis Kernels
17 //------------------------------------------------------------------------------
18 
19 //------------------------------------------------------------------------------
20 // Interp
21 //------------------------------------------------------------------------------
22 extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U,
23                                   CeedScalar *__restrict__ d_V) {
24   const CeedInt t_id = threadIdx.x;
25 
26   const CeedScalar *U;
27   CeedScalar        V;
28   // TODO load B in shared memory if blockDim.z > 1?
29 
30   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
31     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
32       if (transpose) {  // run with P threads
33         U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q;
34         V = 0.0;
35         for (CeedInt i = 0; i < BASIS_Q; i++) V += d_B[t_id + i * BASIS_P] * U[i];
36 
37         d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V;
38       } else {  // run with Q threads
39         U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P;
40         V = 0.0;
41         for (CeedInt i = 0; i < BASIS_P; i++) V += d_B[i + t_id * BASIS_P] * U[i];
42 
43         d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + t_id] = V;
44       }
45     }
46   }
47 }
48 
49 //------------------------------------------------------------------------------
50 // Grad
51 //------------------------------------------------------------------------------
52 extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U,
53                                 CeedScalar *__restrict__ d_V) {
54   const CeedInt t_id = threadIdx.x;
55 
56   const CeedScalar *U;
57   // TODO load G in shared memory if blockDim.z > 1?
58 
59   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
60     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
61       if (transpose) {  // run with P threads
62         CeedScalar V = 0.0;
63         for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
64           U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q;
65           for (CeedInt i = 0; i < BASIS_Q; i++) V += d_G[t_id + i * BASIS_P + dim * BASIS_P * BASIS_Q] * U[i];
66         }
67 
68         d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V;
69       } else {  // run with Q threads
70         CeedScalar V[BASIS_DIM];
71         U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P;
72         for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] = 0.0;
73         for (CeedInt i = 0; i < BASIS_P; i++) {
74           const CeedScalar val = U[i];
75           for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] += d_G[i + t_id * BASIS_P + dim * BASIS_P * BASIS_Q] * val;
76         }
77 
78         for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
79           d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q + t_id] = V[dim];
80         }
81       }
82     }
83   }
84 }
85 
86 //------------------------------------------------------------------------------
87 // Weight
88 //------------------------------------------------------------------------------
89 extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) {
90   const CeedInt t_id = threadIdx.x;
91   // TODO load q_weight in shared memory if blockDim.z > 1?
92   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
93     d_V[elem * BASIS_Q + t_id] = q_weight[t_id];
94   }
95 }
96 
97 //------------------------------------------------------------------------------
98 
99 #endif  // CEED_HIP_REF_BASIS_NONTENSOR_H
100