xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h (revision 22070f9510d4ff69aa6119a59aa3c46d57cc1cc7)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA non-tensor product basis templates
10 #ifndef CEED_CUDA_REF_BASIS_NONTENSOR_TEMPLATES_H
11 #define CEED_CUDA_REF_BASIS_NONTENSOR_TEMPLATES_H
12 
13 #include <ceed.h>
14 
15 //------------------------------------------------------------------------------
16 // Tensor contraction
17 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_COMP, int P, int Q>
inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U,
                                const CeedInt strides_comp_V, const CeedInt strides_q_comp_V, const CeedScalar *__restrict__ d_B,
                                const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  // Apply the basis matrix d_B (P x Q per d_B component) to element `elem` of d_U,
  // writing Q_COMP output components per node into d_V.
  // Launch assumption: Q threads in x — each thread owns one quadrature point
  // (thread id indexes the Q dimension of d_B and the node offset of d_V).
  const CeedInt thread = threadIdx.x;
  CeedScalar    sums[Q_COMP];
  // TODO load B in shared memory if blockDim.z > 1?

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Run with Q threads
    const CeedScalar *u = d_U + elem * strides_elem_U + comp * strides_comp_U;

    for (CeedInt d = 0; d < Q_COMP; d++) sums[d] = 0.0;
    // Accumulate all Q_COMP outputs while streaming each input node once
    for (CeedInt i = 0; i < P; i++) {
      const CeedScalar u_i = u[i];

      for (CeedInt d = 0; d < Q_COMP; d++) sums[d] += d_B[i + thread * P + d * P * Q] * u_i;
    }
    // Hoist the invariant part of the output address; d varies by strides_q_comp_V
    CeedScalar *v = d_V + elem * strides_elem_V + comp * strides_comp_V + thread;

    for (CeedInt d = 0; d < Q_COMP; d++) v[d * strides_q_comp_V] = sums[d];
  }
}
41 
42 //------------------------------------------------------------------------------
43 // Tensor contraction transpose
44 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_COMP, int P, int Q>
inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U,
                                         const CeedInt strides_comp_V, const CeedInt strides_q_comp_U, const CeedScalar *__restrict__ d_B,
                                         const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  // Apply the transpose of the basis matrix d_B to element `elem` of d_U,
  // reducing all Q_COMP input components of each quadrature point into d_V.
  // Launch assumption: P threads in x — each thread owns one basis node
  // (thread id indexes the P dimension of d_B and the node offset of d_V).
  const CeedInt thread = threadIdx.x;
  // TODO load B in shared memory if blockDim.z > 1?

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Run with P threads
    CeedScalar sum = 0.0;

    // Sum contributions from every d_B component plane and quadrature point
    for (CeedInt d = 0; d < Q_COMP; d++) {
      const CeedScalar *u = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U;

      for (CeedInt i = 0; i < Q; i++) sum += d_B[thread + i * P + d * P * Q] * u[i];
    }
    d_V[elem * strides_elem_V + comp * strides_comp_V + thread] = sum;
  }
}
64 
65 //------------------------------------------------------------------------------
66 
67 #endif  // CEED_CUDA_REF_BASIS_NONTENSOR_TEMPLATES_H
68