xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h (revision 019b76820d7ff306c177822c4e76ffe5939c204b)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 
10 //------------------------------------------------------------------------------
11 // Non-Tensor Basis Kernels
12 //------------------------------------------------------------------------------
13 
14 //------------------------------------------------------------------------------
15 // Interp
16 //------------------------------------------------------------------------------
17 extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose,
18                                   const CeedScalar *d_B,
19                                   const CeedScalar *__restrict__ d_U,
20                                   CeedScalar *__restrict__ d_V) {
21   const CeedInt t_id = threadIdx.x;
22 
23   const CeedScalar *U;
24   CeedScalar V;
25   //TODO load B in shared memory if blockDim.z > 1?
26 
27   for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem;
28        elem += gridDim.x*blockDim.z) {
29     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
30       if (transpose) { // run with P threads
31         U = d_U + elem*BASIS_Q + comp*num_elem*BASIS_Q;
32         V = 0.0;
33         for (CeedInt i = 0; i < BASIS_Q; i++)
34           V += d_B[t_id + i*BASIS_P]*U[i];
35 
36         d_V[elem*BASIS_P + comp*num_elem*BASIS_P + t_id] = V;
37       } else { // run with Q threads
38         U = d_U + elem*BASIS_P + comp*num_elem*BASIS_P;
39         V = 0.0;
40         for (CeedInt i = 0; i < BASIS_P; i++)
41           V += d_B[i + t_id*BASIS_P]*U[i];
42 
43         d_V[elem*BASIS_Q + comp*num_elem*BASIS_Q + t_id] = V;
44       }
45     }
46   }
47 }
48 
49 //------------------------------------------------------------------------------
50 // Grad
51 //------------------------------------------------------------------------------
52 extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose,
53                                 const CeedScalar *d_G,
54                                 const CeedScalar *__restrict__ d_U,
55                                 CeedScalar *__restrict__ d_V) {
56   const CeedInt t_id = threadIdx.x;
57 
58   const CeedScalar *U;
59   //TODO load G in shared memory if blockDim.z > 1?
60 
61   for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem;
62        elem += gridDim.x*blockDim.z) {
63     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
64       if (transpose) { // run with P threads
65         CeedScalar V = 0.0;
66         for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
67           U = d_U + elem*BASIS_Q + comp*num_elem*BASIS_Q +
68               dim*BASIS_NUM_COMP*num_elem*BASIS_Q;
69           for (CeedInt i = 0; i < BASIS_Q; i++)
70             V += d_G[t_id + i*BASIS_P + dim*BASIS_P*BASIS_Q]*U[i];
71         }
72 
73         d_V[elem*BASIS_P + comp*num_elem*BASIS_P + t_id] = V;
74       } else { // run with Q threads
75         CeedScalar V[BASIS_DIM];
76         U = d_U + elem*BASIS_P + comp*num_elem*BASIS_P;
77         for (CeedInt dim = 0; dim < BASIS_DIM; dim++)
78           V[dim] = 0.0;
79         for (CeedInt i = 0; i < BASIS_P; i++) {
80           const CeedScalar val = U[i];
81           for(CeedInt dim = 0; dim < BASIS_DIM; dim++)
82             V[dim] += d_G[i + t_id*BASIS_P + dim*BASIS_P*BASIS_Q]*val;
83         }
84 
85         for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
86           d_V[elem*BASIS_Q + comp*num_elem*BASIS_Q + dim*BASIS_NUM_COMP*num_elem*BASIS_Q + t_id] = V[dim];
87         }
88       }
89     }
90   }
91 }
92 
93 //------------------------------------------------------------------------------
94 // Weight
95 //------------------------------------------------------------------------------
96 extern "C" __global__ void Weight(const CeedInt num_elem,
97                                   const CeedScalar *__restrict__ q_weight,
98                                   CeedScalar *__restrict__ d_V) {
99   const CeedInt t_id = threadIdx.x;
100   //TODO load q_weight in shared memory if blockDim.z > 1?
101   for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem;
102        elem += gridDim.x*blockDim.z) {
103     d_V[elem*BASIS_Q + t_id] = q_weight[t_id];
104   }
105 }
106 
107 //------------------------------------------------------------------------------
108