xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h (revision 5fb68f377259d3910de46d787b7c5d1587fd01e1)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 
10 //------------------------------------------------------------------------------
11 // Non-Tensor Basis Kernels
12 //------------------------------------------------------------------------------
13 
14 //------------------------------------------------------------------------------
15 // Interp
16 //------------------------------------------------------------------------------
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U,
                                  CeedScalar *__restrict__ d_V) {
  // Apply the non-tensor interpolation matrix B (or B^T when transpose is set)
  // to every component of every element.
  // Launch layout: threadIdx.x indexes basis entries (P threads when
  // transpose, Q threads otherwise); blockIdx.x/threadIdx.z form a
  // grid-stride loop over elements.
  const CeedInt node = threadIdx.x;
  // Per-element slice lengths of the input and output vectors; they swap
  // roles under transposition.
  const CeedInt u_len = transpose ? BASIS_Q : BASIS_P;
  const CeedInt v_len = transpose ? BASIS_P : BASIS_Q;
  // TODO load B in shared memory if blockDim.z > 1?

  const CeedInt elem_stride = gridDim.x * blockDim.z;

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      // Input slice for this (elem, comp); components are blocked by element.
      const CeedScalar *u   = d_U + (elem + comp * num_elem) * u_len;
      CeedScalar        sum = 0.0;

      for (CeedInt k = 0; k < u_len; k++) {
        // B is stored column-major with P rows: transpose reads B(node, k),
        // the forward action reads B(k, node).
        sum += d_B[transpose ? (node + k * BASIS_P) : (k + node * BASIS_P)] * u[k];
      }
      d_V[(elem + comp * num_elem) * v_len + node] = sum;
    }
  }
}
43 
44 //------------------------------------------------------------------------------
45 // Grad
46 //------------------------------------------------------------------------------
extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V) {
  // Apply the non-tensor gradient matrices G_dim (or their transposes when
  // transpose is set) to every component of every element.
  // Launch layout: threadIdx.x indexes basis entries (P threads when
  // transpose, Q threads otherwise); blockIdx.x/threadIdx.z form a
  // grid-stride loop over elements. G is stored column-major with P rows,
  // one P x Q slab per dimension.
  const CeedInt node        = threadIdx.x;
  const CeedInt elem_stride = gridDim.x * blockDim.z;
  // TODO load G in shared memory if blockDim.z > 1?

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      if (transpose) {  // run with P threads
        // Accumulate every dimension's quadrature data into one nodal value.
        CeedScalar sum = 0.0;

        for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
          const CeedScalar *u = d_U + (elem + (comp + dim * BASIS_NUM_COMP) * num_elem) * BASIS_Q;
          const CeedScalar *g = d_G + dim * BASIS_P * BASIS_Q;

          for (CeedInt q = 0; q < BASIS_Q; q++) sum += g[node + q * BASIS_P] * u[q];
        }
        d_V[(elem + comp * num_elem) * BASIS_P + node] = sum;
      } else {  // run with Q threads
        // One running partial derivative per dimension, all fed from a single
        // pass over the nodal values.
        CeedScalar        grad[BASIS_DIM] = {0.0};
        const CeedScalar *u               = d_U + (elem + comp * num_elem) * BASIS_P;

        for (CeedInt p = 0; p < BASIS_P; p++) {
          const CeedScalar u_p = u[p];

          for (CeedInt dim = 0; dim < BASIS_DIM; dim++) grad[dim] += d_G[p + node * BASIS_P + dim * BASIS_P * BASIS_Q] * u_p;
        }
        for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
          d_V[(elem + (comp + dim * BASIS_NUM_COMP) * num_elem) * BASIS_Q + node] = grad[dim];
        }
      }
    }
  }
}
80 
81 //------------------------------------------------------------------------------
82 // Weight
83 //------------------------------------------------------------------------------
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) {
  // Broadcast the quadrature weights into every element's output slice.
  // Launch layout: threadIdx.x indexes the Q quadrature points;
  // blockIdx.x/threadIdx.z form a grid-stride loop over elements.
  const CeedInt    q = threadIdx.x;
  const CeedScalar w = q_weight[q];  // loop-invariant: same weight for every element
  // TODO load q_weight in shared memory if blockDim.z > 1?
  const CeedInt elem_stride = gridDim.x * blockDim.z;

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) d_V[elem * BASIS_Q + q] = w;
}
91 
92 //------------------------------------------------------------------------------
93