107b31e0eSJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 207b31e0eSJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 307b31e0eSJeremy L Thompson // 407b31e0eSJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 507b31e0eSJeremy L Thompson // 607b31e0eSJeremy L Thompson // This file is part of CEED: http://github.com/ceed 707b31e0eSJeremy L Thompson 8b2165e7aSSebastian Grimberg /// @file 9b2165e7aSSebastian Grimberg /// Internal header for CUDA operator diagonal assembly 1094b7b29bSJeremy L Thompson #ifndef CEED_CUDA_REF_OPERATOR_ASSEMBLE_DIAGONAL_H 1194b7b29bSJeremy L Thompson #define CEED_CUDA_REF_OPERATOR_ASSEMBLE_DIAGONAL_H 12b2165e7aSSebastian Grimberg 13c9c2c079SJeremy L Thompson #include <ceed.h> 1407b31e0eSJeremy L Thompson 15*ca735530SJeremy L Thompson #if USE_CEEDSIZE 16f7c1b517Snbeams typedef CeedSize IndexType; 17f7c1b517Snbeams #else 18f7c1b517Snbeams typedef CeedInt IndexType; 19f7c1b517Snbeams #endif 20f7c1b517Snbeams 2107b31e0eSJeremy L Thompson //------------------------------------------------------------------------------ 2207b31e0eSJeremy L Thompson // Get Basis Emode Pointer 2307b31e0eSJeremy L Thompson //------------------------------------------------------------------------------ 24*ca735530SJeremy L Thompson extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basis_ptr, CeedEvalMode e_mode, const CeedScalar *identity, 252b730f8bSJeremy L Thompson const CeedScalar *interp, const CeedScalar *grad) { 26*ca735530SJeremy L Thompson switch (e_mode) { 2707b31e0eSJeremy L Thompson case CEED_EVAL_NONE: 28*ca735530SJeremy L Thompson *basis_ptr = identity; 2907b31e0eSJeremy L Thompson break; 3007b31e0eSJeremy L Thompson case CEED_EVAL_INTERP: 31*ca735530SJeremy L Thompson *basis_ptr = interp; 3207b31e0eSJeremy L Thompson break; 3307b31e0eSJeremy L Thompson case CEED_EVAL_GRAD: 34*ca735530SJeremy L Thompson *basis_ptr = grad; 3507b31e0eSJeremy L Thompson break; 3607b31e0eSJeremy L Thompson case CEED_EVAL_WEIGHT: 3707b31e0eSJeremy L Thompson case CEED_EVAL_DIV: 3807b31e0eSJeremy L Thompson case CEED_EVAL_CURL: 3907b31e0eSJeremy L Thompson break; // Caught by QF Assembly 4007b31e0eSJeremy L Thompson } 4107b31e0eSJeremy L Thompson } 4207b31e0eSJeremy L Thompson 4307b31e0eSJeremy L Thompson //------------------------------------------------------------------------------ 4407b31e0eSJeremy L Thompson // Core code for diagonal assembly 4507b31e0eSJeremy L Thompson //------------------------------------------------------------------------------ 46*ca735530SJeremy L Thompson __device__ void diagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, const CeedScalar *interp_in, 47*ca735530SJeremy L Thompson const CeedScalar *grad_in, const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedEvalMode *e_mode_in, 48*ca735530SJeremy L Thompson const CeedEvalMode *e_mode_out, const CeedScalar *__restrict__ assembled_qf_array, 49*ca735530SJeremy L Thompson CeedScalar *__restrict__ elem_diag_array) { 5007b31e0eSJeremy L Thompson const int tid = threadIdx.x; // running with P threads, tid is evec node 51*ca735530SJeremy L Thompson if (tid >= NUM_NODES) return; 5207b31e0eSJeremy L Thompson 5307b31e0eSJeremy L Thompson // Compute the diagonal of B^T D B 5407b31e0eSJeremy L Thompson // Each element 55*ca735530SJeremy L Thompson for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) { 56*ca735530SJeremy L Thompson IndexType d_out = -1; 57*ca735530SJeremy L Thompson 5807b31e0eSJeremy L Thompson // Each basis eval mode pair 59*ca735530SJeremy L Thompson for (IndexType e_out = 0; e_out < NUM_E_MODE_OUT; e_out++) { 60*ca735530SJeremy L Thompson const CeedScalar *b_t = NULL; 61*ca735530SJeremy L Thompson 62*ca735530SJeremy L Thompson if (e_mode_out[e_out] == CEED_EVAL_GRAD) d_out += 1; 63*ca735530SJeremy L Thompson CeedOperatorGetBasisPointer_Cuda(&b_t, e_mode_out[e_out], identity, interp_out, &grad_out[d_out * NUM_QPTS * NUM_NODES]); 64*ca735530SJeremy L Thompson IndexType d_in = -1; 65*ca735530SJeremy L Thompson 66*ca735530SJeremy L Thompson for (IndexType e_in = 0; e_in < NUM_E_MODE_IN; e_in++) { 6707b31e0eSJeremy L Thompson const CeedScalar *b = NULL; 68*ca735530SJeremy L Thompson 69*ca735530SJeremy L Thompson if (e_mode_in[e_in] == CEED_EVAL_GRAD) d_in += 1; 70*ca735530SJeremy L Thompson CeedOperatorGetBasisPointer_Cuda(&b, e_mode_in[e_in], identity, interp_in, &grad_in[d_in * NUM_QPTS * NUM_NODES]); 7107b31e0eSJeremy L Thompson // Each component 72*ca735530SJeremy L Thompson for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { 7307b31e0eSJeremy L Thompson // Each qpoint/node pair 74*ca735530SJeremy L Thompson if (is_point_block) { 7507b31e0eSJeremy L Thompson // Point Block Diagonal 76*ca735530SJeremy L Thompson for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { 77*ca735530SJeremy L Thompson CeedScalar e_value = 0.; 78*ca735530SJeremy L Thompson 79*ca735530SJeremy L Thompson for (IndexType q = 0; q < NUM_QPTS; q++) { 80*ca735530SJeremy L Thompson const CeedScalar qf_value = 81*ca735530SJeremy L Thompson assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_E_MODE_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + 82*ca735530SJeremy L Thompson q]; 83*ca735530SJeremy L Thompson 84*ca735530SJeremy L Thompson e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; 8507b31e0eSJeremy L Thompson } 86*ca735530SJeremy L Thompson elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; 8707b31e0eSJeremy L Thompson } 8807b31e0eSJeremy L Thompson } else { 8907b31e0eSJeremy L Thompson // Diagonal Only 90*ca735530SJeremy L Thompson CeedScalar e_value = 0.; 91*ca735530SJeremy L Thompson 92*ca735530SJeremy L Thompson for (IndexType q = 0; q < NUM_QPTS; q++) { 93*ca735530SJeremy L Thompson const CeedScalar qf_value = 94*ca735530SJeremy L Thompson assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_E_MODE_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + q]; 95*ca735530SJeremy L Thompson 96*ca735530SJeremy L Thompson e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; 9707b31e0eSJeremy L Thompson } 98*ca735530SJeremy L Thompson elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; 9907b31e0eSJeremy L Thompson } 10007b31e0eSJeremy L Thompson } 10107b31e0eSJeremy L Thompson } 10207b31e0eSJeremy L Thompson } 10307b31e0eSJeremy L Thompson } 10407b31e0eSJeremy L Thompson } 10507b31e0eSJeremy L Thompson 10607b31e0eSJeremy L Thompson //------------------------------------------------------------------------------ 10707b31e0eSJeremy L Thompson // Linear diagonal 10807b31e0eSJeremy L Thompson //------------------------------------------------------------------------------ 109*ca735530SJeremy L Thompson extern "C" __global__ void linearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, 110*ca735530SJeremy L Thompson const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedEvalMode *e_mode_in, 111*ca735530SJeremy L Thompson const CeedEvalMode *e_mode_out, const CeedScalar *__restrict__ assembled_qf_array, 112*ca735530SJeremy L Thompson CeedScalar *__restrict__ elem_diag_array) { 113*ca735530SJeremy L Thompson diagonalCore(num_elem, false, identity, interp_in, grad_in, interp_out, grad_out, e_mode_in, e_mode_out, assembled_qf_array, elem_diag_array); 11407b31e0eSJeremy L Thompson } 11507b31e0eSJeremy L Thompson 11607b31e0eSJeremy L Thompson //------------------------------------------------------------------------------ 11707b31e0eSJeremy L Thompson // Linear point block diagonal 11807b31e0eSJeremy L Thompson //------------------------------------------------------------------------------ 119*ca735530SJeremy L Thompson extern "C" __global__ void linearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, 120*ca735530SJeremy L Thompson const CeedScalar *grad_in, const CeedScalar *interp_out, const CeedScalar *grad_out, 121*ca735530SJeremy L Thompson const CeedEvalMode *e_mode_in, const CeedEvalMode *e_mode_out, 122*ca735530SJeremy L Thompson const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { 123*ca735530SJeremy L Thompson diagonalCore(num_elem, true, identity, interp_in, grad_in, interp_out, grad_out, e_mode_in, e_mode_out, assembled_qf_array, elem_diag_array); 12407b31e0eSJeremy L Thompson } 12507b31e0eSJeremy L Thompson 12607b31e0eSJeremy L Thompson //------------------------------------------------------------------------------ 127b2165e7aSSebastian Grimberg 12894b7b29bSJeremy L Thompson #endif // CEED_CUDA_REF_OPERATOR_ASSEMBLE_DIAGONAL_H 129