// xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h (revision 49a40c8a2d720db341b0b117b89656b473cbebfb)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA non-tensor product basis
10 #ifndef CEED_CUDA_REF_BASIS_NONTENSOR_H
11 #define CEED_CUDA_REF_BASIS_NONTENSOR_H
12 
13 #include <ceed.h>
14 
15 //------------------------------------------------------------------------------
16 // Non-Tensor Basis Kernels
17 //------------------------------------------------------------------------------
18 
19 //------------------------------------------------------------------------------
20 // Interp
21 //------------------------------------------------------------------------------
22 extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U,
23                                   CeedScalar *__restrict__ d_V) {
24   const CeedInt     t_id = threadIdx.x;
25   const CeedScalar *U;
26   CeedScalar        V;
27   // TODO load B in shared memory if blockDim.z > 1?
28 
29   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
30     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
31       if (transpose) {  // run with P threads
32         U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q;
33         V = 0.0;
34         for (CeedInt i = 0; i < BASIS_Q; i++) V += d_B[t_id + i * BASIS_P] * U[i];
35 
36         d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V;
37       } else {  // run with Q threads
38         U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P;
39         V = 0.0;
40         for (CeedInt i = 0; i < BASIS_P; i++) V += d_B[i + t_id * BASIS_P] * U[i];
41 
42         d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + t_id] = V;
43       }
44     }
45   }
46 }
47 
48 //------------------------------------------------------------------------------
49 // Grad
50 //------------------------------------------------------------------------------
51 extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U,
52                                 CeedScalar *__restrict__ d_V) {
53   const CeedInt     t_id = threadIdx.x;
54   const CeedScalar *U;
55   // TODO load G in shared memory if blockDim.z > 1?
56 
57   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
58     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
59       if (transpose) {  // run with P threads
60         CeedScalar V = 0.0;
61         for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
62           U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q;
63           for (CeedInt i = 0; i < BASIS_Q; i++) V += d_G[t_id + i * BASIS_P + dim * BASIS_P * BASIS_Q] * U[i];
64         }
65 
66         d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V;
67       } else {  // run with Q threads
68         CeedScalar V[BASIS_DIM];
69         U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P;
70         for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] = 0.0;
71         for (CeedInt i = 0; i < BASIS_P; i++) {
72           const CeedScalar val = U[i];
73           for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] += d_G[i + t_id * BASIS_P + dim * BASIS_P * BASIS_Q] * val;
74         }
75 
76         for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
77           d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q + t_id] = V[dim];
78         }
79       }
80     }
81   }
82 }
83 
84 //------------------------------------------------------------------------------
85 // Weight
86 //------------------------------------------------------------------------------
87 extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) {
88   const CeedInt t_id = threadIdx.x;
89 
90   // TODO load q_weight in shared memory if blockDim.z > 1?
91   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
92     d_V[elem * BASIS_Q + t_id] = q_weight[t_id];
93   }
94 }
95 
96 //------------------------------------------------------------------------------
97 
98 #endif  // CEED_CUDA_REF_BASIS_NONTENSOR_H
99