xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h (revision ca7355308a14805110a7d98dabf1f658d5ec16d5)
107b31e0eSJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
207b31e0eSJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
307b31e0eSJeremy L Thompson //
407b31e0eSJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
507b31e0eSJeremy L Thompson //
607b31e0eSJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
707b31e0eSJeremy L Thompson 
8b2165e7aSSebastian Grimberg /// @file
9b2165e7aSSebastian Grimberg /// Internal header for CUDA operator diagonal assembly
1094b7b29bSJeremy L Thompson #ifndef CEED_CUDA_REF_OPERATOR_ASSEMBLE_DIAGONAL_H
1194b7b29bSJeremy L Thompson #define CEED_CUDA_REF_OPERATOR_ASSEMBLE_DIAGONAL_H
12b2165e7aSSebastian Grimberg 
13c9c2c079SJeremy L Thompson #include <ceed.h>
1407b31e0eSJeremy L Thompson 
15*ca735530SJeremy L Thompson #if USE_CEEDSIZE
16f7c1b517Snbeams typedef CeedSize IndexType;
17f7c1b517Snbeams #else
18f7c1b517Snbeams typedef CeedInt IndexType;
19f7c1b517Snbeams #endif
20f7c1b517Snbeams 
2107b31e0eSJeremy L Thompson //------------------------------------------------------------------------------
2207b31e0eSJeremy L Thompson // Get Basis Emode Pointer
2307b31e0eSJeremy L Thompson //------------------------------------------------------------------------------
24*ca735530SJeremy L Thompson extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basis_ptr, CeedEvalMode e_mode, const CeedScalar *identity,
252b730f8bSJeremy L Thompson                                                             const CeedScalar *interp, const CeedScalar *grad) {
26*ca735530SJeremy L Thompson   switch (e_mode) {
2707b31e0eSJeremy L Thompson     case CEED_EVAL_NONE:
28*ca735530SJeremy L Thompson       *basis_ptr = identity;
2907b31e0eSJeremy L Thompson       break;
3007b31e0eSJeremy L Thompson     case CEED_EVAL_INTERP:
31*ca735530SJeremy L Thompson       *basis_ptr = interp;
3207b31e0eSJeremy L Thompson       break;
3307b31e0eSJeremy L Thompson     case CEED_EVAL_GRAD:
34*ca735530SJeremy L Thompson       *basis_ptr = grad;
3507b31e0eSJeremy L Thompson       break;
3607b31e0eSJeremy L Thompson     case CEED_EVAL_WEIGHT:
3707b31e0eSJeremy L Thompson     case CEED_EVAL_DIV:
3807b31e0eSJeremy L Thompson     case CEED_EVAL_CURL:
3907b31e0eSJeremy L Thompson       break;  // Caught by QF Assembly
4007b31e0eSJeremy L Thompson   }
4107b31e0eSJeremy L Thompson }
4207b31e0eSJeremy L Thompson 
4307b31e0eSJeremy L Thompson //------------------------------------------------------------------------------
4407b31e0eSJeremy L Thompson // Core code for diagonal assembly
4507b31e0eSJeremy L Thompson //------------------------------------------------------------------------------
46*ca735530SJeremy L Thompson __device__ void diagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, const CeedScalar *interp_in,
47*ca735530SJeremy L Thompson                              const CeedScalar *grad_in, const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedEvalMode *e_mode_in,
48*ca735530SJeremy L Thompson                              const CeedEvalMode *e_mode_out, const CeedScalar *__restrict__ assembled_qf_array,
49*ca735530SJeremy L Thompson                              CeedScalar *__restrict__ elem_diag_array) {
5007b31e0eSJeremy L Thompson   const int tid = threadIdx.x;  // running with P threads, tid is evec node
51*ca735530SJeremy L Thompson   if (tid >= NUM_NODES) return;
5207b31e0eSJeremy L Thompson 
5307b31e0eSJeremy L Thompson   // Compute the diagonal of B^T D B
5407b31e0eSJeremy L Thompson   // Each element
55*ca735530SJeremy L Thompson   for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
56*ca735530SJeremy L Thompson     IndexType d_out = -1;
57*ca735530SJeremy L Thompson 
5807b31e0eSJeremy L Thompson     // Each basis eval mode pair
59*ca735530SJeremy L Thompson     for (IndexType e_out = 0; e_out < NUM_E_MODE_OUT; e_out++) {
60*ca735530SJeremy L Thompson       const CeedScalar *b_t = NULL;
61*ca735530SJeremy L Thompson 
62*ca735530SJeremy L Thompson       if (e_mode_out[e_out] == CEED_EVAL_GRAD) d_out += 1;
63*ca735530SJeremy L Thompson       CeedOperatorGetBasisPointer_Cuda(&b_t, e_mode_out[e_out], identity, interp_out, &grad_out[d_out * NUM_QPTS * NUM_NODES]);
64*ca735530SJeremy L Thompson       IndexType d_in = -1;
65*ca735530SJeremy L Thompson 
66*ca735530SJeremy L Thompson       for (IndexType e_in = 0; e_in < NUM_E_MODE_IN; e_in++) {
6707b31e0eSJeremy L Thompson         const CeedScalar *b = NULL;
68*ca735530SJeremy L Thompson 
69*ca735530SJeremy L Thompson         if (e_mode_in[e_in] == CEED_EVAL_GRAD) d_in += 1;
70*ca735530SJeremy L Thompson         CeedOperatorGetBasisPointer_Cuda(&b, e_mode_in[e_in], identity, interp_in, &grad_in[d_in * NUM_QPTS * NUM_NODES]);
7107b31e0eSJeremy L Thompson         // Each component
72*ca735530SJeremy L Thompson         for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) {
7307b31e0eSJeremy L Thompson           // Each qpoint/node pair
74*ca735530SJeremy L Thompson           if (is_point_block) {
7507b31e0eSJeremy L Thompson             // Point Block Diagonal
76*ca735530SJeremy L Thompson             for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) {
77*ca735530SJeremy L Thompson               CeedScalar e_value = 0.;
78*ca735530SJeremy L Thompson 
79*ca735530SJeremy L Thompson               for (IndexType q = 0; q < NUM_QPTS; q++) {
80*ca735530SJeremy L Thompson                 const CeedScalar qf_value =
81*ca735530SJeremy L Thompson                     assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_E_MODE_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS +
82*ca735530SJeremy L Thompson                                        q];
83*ca735530SJeremy L Thompson 
84*ca735530SJeremy L Thompson                 e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid];
8507b31e0eSJeremy L Thompson               }
86*ca735530SJeremy L Thompson               elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value;
8707b31e0eSJeremy L Thompson             }
8807b31e0eSJeremy L Thompson           } else {
8907b31e0eSJeremy L Thompson             // Diagonal Only
90*ca735530SJeremy L Thompson             CeedScalar e_value = 0.;
91*ca735530SJeremy L Thompson 
92*ca735530SJeremy L Thompson             for (IndexType q = 0; q < NUM_QPTS; q++) {
93*ca735530SJeremy L Thompson               const CeedScalar qf_value =
94*ca735530SJeremy L Thompson                   assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_E_MODE_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + q];
95*ca735530SJeremy L Thompson 
96*ca735530SJeremy L Thompson               e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid];
9707b31e0eSJeremy L Thompson             }
98*ca735530SJeremy L Thompson             elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value;
9907b31e0eSJeremy L Thompson           }
10007b31e0eSJeremy L Thompson         }
10107b31e0eSJeremy L Thompson       }
10207b31e0eSJeremy L Thompson     }
10307b31e0eSJeremy L Thompson   }
10407b31e0eSJeremy L Thompson }
10507b31e0eSJeremy L Thompson 
10607b31e0eSJeremy L Thompson //------------------------------------------------------------------------------
10707b31e0eSJeremy L Thompson // Linear diagonal
10807b31e0eSJeremy L Thompson //------------------------------------------------------------------------------
109*ca735530SJeremy L Thompson extern "C" __global__ void linearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in,
110*ca735530SJeremy L Thompson                                           const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedEvalMode *e_mode_in,
111*ca735530SJeremy L Thompson                                           const CeedEvalMode *e_mode_out, const CeedScalar *__restrict__ assembled_qf_array,
112*ca735530SJeremy L Thompson                                           CeedScalar *__restrict__ elem_diag_array) {
113*ca735530SJeremy L Thompson   diagonalCore(num_elem, false, identity, interp_in, grad_in, interp_out, grad_out, e_mode_in, e_mode_out, assembled_qf_array, elem_diag_array);
11407b31e0eSJeremy L Thompson }
11507b31e0eSJeremy L Thompson 
11607b31e0eSJeremy L Thompson //------------------------------------------------------------------------------
11707b31e0eSJeremy L Thompson // Linear point block diagonal
11807b31e0eSJeremy L Thompson //------------------------------------------------------------------------------
119*ca735530SJeremy L Thompson extern "C" __global__ void linearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in,
120*ca735530SJeremy L Thompson                                                     const CeedScalar *grad_in, const CeedScalar *interp_out, const CeedScalar *grad_out,
121*ca735530SJeremy L Thompson                                                     const CeedEvalMode *e_mode_in, const CeedEvalMode *e_mode_out,
122*ca735530SJeremy L Thompson                                                     const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) {
123*ca735530SJeremy L Thompson   diagonalCore(num_elem, true, identity, interp_in, grad_in, interp_out, grad_out, e_mode_in, e_mode_out, assembled_qf_array, elem_diag_array);
12407b31e0eSJeremy L Thompson }
12507b31e0eSJeremy L Thompson 
12607b31e0eSJeremy L Thompson //------------------------------------------------------------------------------
127b2165e7aSSebastian Grimberg 
12894b7b29bSJeremy L Thompson #endif  // CEED_CUDA_REF_OPERATOR_ASSEMBLE_DIAGONAL_H
129