13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 92b730f8bSJeremy L Thompson #include <ceed/backend.h> 102b730f8bSJeremy L Thompson #include <ceed/jit-tools.h> 11c85e8640SSebastian Grimberg #include <assert.h> 120d0321e0SJeremy L Thompson #include <cuda.h> 130d0321e0SJeremy L Thompson #include <cuda_runtime.h> 140d0321e0SJeremy L Thompson #include <stdbool.h> 150d0321e0SJeremy L Thompson #include <string.h> 162b730f8bSJeremy L Thompson 1749aac155SJeremy L Thompson #include "../cuda/ceed-cuda-common.h" 180d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 192b730f8bSJeremy L Thompson #include "ceed-cuda-ref.h" 200d0321e0SJeremy L Thompson 210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 220d0321e0SJeremy L Thompson // Destroy operator 230d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 240d0321e0SJeremy L Thompson static int CeedOperatorDestroy_Cuda(CeedOperator op) { 250d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 26ca735530SJeremy L Thompson 272b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 280d0321e0SJeremy L Thompson 290d0321e0SJeremy L Thompson // Apply data 30ca735530SJeremy L Thompson for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { 31ca735530SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i])); 320d0321e0SJeremy L Thompson } 33ca735530SJeremy L Thompson CeedCallBackend(CeedFree(&impl->e_vecs)); 340d0321e0SJeremy L Thompson 35ca735530SJeremy L Thompson for (CeedInt i = 0; i < impl->num_inputs; i++) { 36ca735530SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i])); 370d0321e0SJeremy L Thompson } 38ca735530SJeremy L Thompson CeedCallBackend(CeedFree(&impl->q_vecs_in)); 390d0321e0SJeremy L Thompson 40ca735530SJeremy L Thompson for (CeedInt i = 0; i < impl->num_outputs; i++) { 41ca735530SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); 420d0321e0SJeremy L Thompson } 43ca735530SJeremy L Thompson CeedCallBackend(CeedFree(&impl->q_vecs_out)); 440d0321e0SJeremy L Thompson 450d0321e0SJeremy L Thompson // QFunction assembly data 46ca735530SJeremy L Thompson for (CeedInt i = 0; i < impl->num_active_in; i++) { 47ca735530SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); 480d0321e0SJeremy L Thompson } 49ca735530SJeremy L Thompson CeedCallBackend(CeedFree(&impl->qf_active_in)); 500d0321e0SJeremy L Thompson 510d0321e0SJeremy L Thompson // Diag data 520d0321e0SJeremy L Thompson if (impl->diag) { 530d0321e0SJeremy L Thompson Ceed ceed; 54ca735530SJeremy L Thompson 552b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 562b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(impl->diag->module)); 57ca735530SJeremy L Thompson CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in)); 58ca735530SJeremy L Thompson CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out)); 59ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_e_mode_in)); 60ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_e_mode_out)); 612b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_identity)); 62ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_in)); 63ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_out)); 64ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_in)); 65ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_out)); 66ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_rstr)); 67ca735530SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); 68ca735530SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); 690d0321e0SJeremy L Thompson } 702b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->diag)); 710d0321e0SJeremy L Thompson 72cc132f9aSnbeams if (impl->asmb) { 73cc132f9aSnbeams Ceed ceed; 74ca735530SJeremy L Thompson 752b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 762b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(impl->asmb->module)); 772b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_in)); 782b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_out)); 79cc132f9aSnbeams } 802b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->asmb)); 81cc132f9aSnbeams 822b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 830d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 840d0321e0SJeremy L Thompson } 850d0321e0SJeremy L Thompson 860d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 870d0321e0SJeremy L Thompson // Setup infields or outfields 880d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 89ca735530SJeremy L Thompson static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt e_start, 90ca735530SJeremy L Thompson CeedInt num_fields, CeedInt Q, CeedInt num_elem) { 910d0321e0SJeremy L Thompson Ceed ceed; 92ca735530SJeremy L Thompson bool is_strided, skip_restriction; 93ca735530SJeremy L Thompson CeedSize q_size; 94ca735530SJeremy L Thompson CeedInt dim, size; 95ca735530SJeremy L Thompson CeedQFunctionField *qf_fields; 96ca735530SJeremy L Thompson CeedOperatorField *op_fields; 970d0321e0SJeremy L Thompson 98ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 99ca735530SJeremy L Thompson 100ca735530SJeremy L Thompson if (is_input) { 101ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); 102ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 1030d0321e0SJeremy L Thompson } else { 104ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); 105ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 1060d0321e0SJeremy L Thompson } 1070d0321e0SJeremy L Thompson 1080d0321e0SJeremy L Thompson // Loop over fields 109ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_fields; i++) { 110ca735530SJeremy L Thompson CeedEvalMode e_mode; 111ca735530SJeremy L Thompson CeedBasis basis; 1120d0321e0SJeremy L Thompson 113ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); 114ca735530SJeremy L Thompson 115ca735530SJeremy L Thompson is_strided = false; 116ca735530SJeremy L Thompson skip_restriction = false; 117ca735530SJeremy L Thompson if (e_mode != CEED_EVAL_WEIGHT) { 118*edb2538eSJeremy L Thompson CeedElemRestriction elem_rstr; 119ca735530SJeremy L Thompson 120*edb2538eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); 1210d0321e0SJeremy L Thompson 1220d0321e0SJeremy L Thompson // Check whether this field can skip the element restriction: 123ca735530SJeremy L Thompson // must be passive input, with e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. 1240d0321e0SJeremy L Thompson 1250d0321e0SJeremy L Thompson // First, check whether the field is input or output: 126ca735530SJeremy L Thompson if (is_input) { 127ca735530SJeremy L Thompson CeedVector vec; 128ca735530SJeremy L Thompson 1290d0321e0SJeremy L Thompson // Check for passive input: 130ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); 131ca735530SJeremy L Thompson if (vec != CEED_VECTOR_ACTIVE) { 132ca735530SJeremy L Thompson // Check e_mode 133ca735530SJeremy L Thompson if (e_mode == CEED_EVAL_NONE) { 1340d0321e0SJeremy L Thompson // Check for strided restriction 135*edb2538eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 136ca735530SJeremy L Thompson if (is_strided) { 1370d0321e0SJeremy L Thompson // Check if vector is already in preferred backend ordering 138*edb2538eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction)); 1390d0321e0SJeremy L Thompson } 1400d0321e0SJeremy L Thompson } 1410d0321e0SJeremy L Thompson } 1420d0321e0SJeremy L Thompson } 143ca735530SJeremy L Thompson if (skip_restriction) { 144ea61e9acSJeremy L Thompson // We do not need an E-Vector, but will use the input field vector's data directly in the operator application. 145ca735530SJeremy L Thompson e_vecs[i + e_start] = NULL; 1460d0321e0SJeremy L Thompson } else { 147*edb2538eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + e_start])); 1480d0321e0SJeremy L Thompson } 1490d0321e0SJeremy L Thompson } 1500d0321e0SJeremy L Thompson 151ca735530SJeremy L Thompson switch (e_mode) { 1520d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 153ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); 154ca735530SJeremy L Thompson q_size = (CeedSize)num_elem * Q * size; 155ca735530SJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); 1560d0321e0SJeremy L Thompson break; 1570d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 158ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); 159ca735530SJeremy L Thompson q_size = (CeedSize)num_elem * Q * size; 160ca735530SJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); 1610d0321e0SJeremy L Thompson break; 1620d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 163ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); 164ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); 1652b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 166ca735530SJeremy L Thompson q_size = (CeedSize)num_elem * Q * size; 167ca735530SJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); 1680d0321e0SJeremy L Thompson break; 1690d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: // Only on input fields 170ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); 171ca735530SJeremy L Thompson q_size = (CeedSize)num_elem * Q; 172ca735530SJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); 173ca735530SJeremy L Thompson CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); 1740d0321e0SJeremy L Thompson break; 1750d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 1760d0321e0SJeremy L Thompson break; // TODO: Not implemented 1770d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 1780d0321e0SJeremy L Thompson break; // TODO: Not implemented 1790d0321e0SJeremy L Thompson } 1800d0321e0SJeremy L Thompson } 1810d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1820d0321e0SJeremy L Thompson } 1830d0321e0SJeremy L Thompson 1840d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 185ea61e9acSJeremy L Thompson // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction. 1860d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1870d0321e0SJeremy L Thompson static int CeedOperatorSetup_Cuda(CeedOperator op) { 1880d0321e0SJeremy L Thompson Ceed ceed; 189ca735530SJeremy L Thompson bool is_setup_done; 190ca735530SJeremy L Thompson CeedInt Q, num_elem, num_input_fields, num_output_fields; 191ca735530SJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 1920d0321e0SJeremy L Thompson CeedQFunction qf; 193ca735530SJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 194ca735530SJeremy L Thompson CeedOperator_Cuda *impl; 195ca735530SJeremy L Thompson 196ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 197ca735530SJeremy L Thompson if (is_setup_done) return CEED_ERROR_SUCCESS; 198ca735530SJeremy L Thompson 199ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 200ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 2012b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 2022b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 203ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 204ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 205ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 2060d0321e0SJeremy L Thompson 2070d0321e0SJeremy L Thompson // Allocate 208ca735530SJeremy L Thompson CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); 2090d0321e0SJeremy L Thompson 210ca735530SJeremy L Thompson CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); 211ca735530SJeremy L Thompson CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); 2120d0321e0SJeremy L Thompson 213ca735530SJeremy L Thompson impl->num_inputs = num_input_fields; 214ca735530SJeremy L Thompson impl->num_outputs = num_output_fields; 2150d0321e0SJeremy L Thompson 216ca735530SJeremy L Thompson // Set up infield and outfield e_vecs and q_vecs 2170d0321e0SJeremy L Thompson // Infields 218ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); 2190d0321e0SJeremy L Thompson // Outfields 220ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); 2210d0321e0SJeremy L Thompson 2222b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetSetupDone(op)); 2230d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2240d0321e0SJeremy L Thompson } 2250d0321e0SJeremy L Thompson 2260d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2270d0321e0SJeremy L Thompson // Setup Operator Inputs 2280d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 229ca735530SJeremy L Thompson static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, 230ca735530SJeremy L Thompson CeedVector in_vec, const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], 2310d0321e0SJeremy L Thompson CeedOperator_Cuda *impl, CeedRequest *request) { 232ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 233ca735530SJeremy L Thompson CeedEvalMode e_mode; 2340d0321e0SJeremy L Thompson CeedVector vec; 235*edb2538eSJeremy L Thompson CeedElemRestriction elem_rstr; 2360d0321e0SJeremy L Thompson 2370d0321e0SJeremy L Thompson // Get input vector 238ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 2390d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 240ca735530SJeremy L Thompson if (skip_active_in) continue; 241ca735530SJeremy L Thompson else vec = in_vec; 2420d0321e0SJeremy L Thompson } 2430d0321e0SJeremy L Thompson 244ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); 245ca735530SJeremy L Thompson if (e_mode == CEED_EVAL_WEIGHT) { // Skip 2460d0321e0SJeremy L Thompson } else { 2470d0321e0SJeremy L Thompson // Get input element restriction 248*edb2538eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 249ca735530SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) vec = in_vec; 2500d0321e0SJeremy L Thompson // Restrict, if necessary 251ca735530SJeremy L Thompson if (!impl->e_vecs[i]) { 2520d0321e0SJeremy L Thompson // No restriction for this field; read data directly from vec. 253ca735530SJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); 2540d0321e0SJeremy L Thompson } else { 255*edb2538eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); 2560d0321e0SJeremy L Thompson // Get evec 257ca735530SJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); 2580d0321e0SJeremy L Thompson } 2590d0321e0SJeremy L Thompson } 2600d0321e0SJeremy L Thompson } 2610d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2620d0321e0SJeremy L Thompson } 2630d0321e0SJeremy L Thompson 2640d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2650d0321e0SJeremy L Thompson // Input Basis Action 2660d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 267ca735530SJeremy L Thompson static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, 268ca735530SJeremy L Thompson CeedInt num_input_fields, const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], 2692b730f8bSJeremy L Thompson CeedOperator_Cuda *impl) { 270ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 271ca735530SJeremy L Thompson CeedInt elem_size, size; 272ca735530SJeremy L Thompson CeedEvalMode e_mode; 273*edb2538eSJeremy L Thompson CeedElemRestriction elem_rstr; 2740d0321e0SJeremy L Thompson CeedBasis basis; 2750d0321e0SJeremy L Thompson 2760d0321e0SJeremy L Thompson // Skip active input 277ca735530SJeremy L Thompson if (skip_active_in) { 2780d0321e0SJeremy L Thompson CeedVector vec; 279ca735530SJeremy L Thompson 280ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 2812b730f8bSJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) continue; 2820d0321e0SJeremy L Thompson } 283ca735530SJeremy L Thompson // Get elem_size, e_mode, size 284*edb2538eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 285*edb2538eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 286ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); 287ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); 2880d0321e0SJeremy L Thompson // Basis action 289ca735530SJeremy L Thompson switch (e_mode) { 2900d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 291ca735530SJeremy L Thompson CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); 2920d0321e0SJeremy L Thompson break; 2930d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 294ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 295ca735530SJeremy L Thompson CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i])); 2960d0321e0SJeremy L Thompson break; 2970d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 298ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 299ca735530SJeremy L Thompson CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i])); 3000d0321e0SJeremy L Thompson break; 3010d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: 3020d0321e0SJeremy L Thompson break; // No action 3030d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 3040d0321e0SJeremy L Thompson break; // TODO: Not implemented 3050d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 3060d0321e0SJeremy L Thompson break; // TODO: Not implemented 3070d0321e0SJeremy L Thompson } 3080d0321e0SJeremy L Thompson } 3090d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3100d0321e0SJeremy L Thompson } 3110d0321e0SJeremy L Thompson 3120d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3130d0321e0SJeremy L Thompson // Restore Input Vectors 3140d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 315ca735530SJeremy L Thompson static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, 316ca735530SJeremy L Thompson const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { 317ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 318ca735530SJeremy L Thompson CeedEvalMode e_mode; 3190d0321e0SJeremy L Thompson CeedVector vec; 3200d0321e0SJeremy L Thompson 3210d0321e0SJeremy L Thompson // Skip active input 322ca735530SJeremy L Thompson if (skip_active_in) { 323ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 3242b730f8bSJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) continue; 3250d0321e0SJeremy L Thompson } 326ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); 327ca735530SJeremy L Thompson if (e_mode == CEED_EVAL_WEIGHT) { // Skip 3280d0321e0SJeremy L Thompson } else { 329ca735530SJeremy L Thompson if (!impl->e_vecs[i]) { // This was a skip_restriction case 330ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 331ca735530SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i])); 3320d0321e0SJeremy L Thompson } else { 333ca735530SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i])); 3340d0321e0SJeremy L Thompson } 3350d0321e0SJeremy L Thompson } 3360d0321e0SJeremy L Thompson } 3370d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3380d0321e0SJeremy L Thompson } 3390d0321e0SJeremy L Thompson 3400d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3410d0321e0SJeremy L Thompson // Apply and add to output 3420d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 343ca735530SJeremy L Thompson static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { 3440d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 345ca735530SJeremy L Thompson CeedInt Q, num_elem, elem_size, num_input_fields, num_output_fields, size; 346ca735530SJeremy L Thompson CeedEvalMode e_mode; 347ca735530SJeremy L Thompson CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; 348ca735530SJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 349ca735530SJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 3500d0321e0SJeremy L Thompson CeedQFunction qf; 351ca735530SJeremy L Thompson 352ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 3532b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 3542b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 355ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 356ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 357ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 3580d0321e0SJeremy L Thompson 3590d0321e0SJeremy L Thompson // Setup 3602b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetup_Cuda(op)); 3610d0321e0SJeremy L Thompson 362ca735530SJeremy L Thompson // Input e_vecs and Restriction 363ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); 3640d0321e0SJeremy L Thompson 3650d0321e0SJeremy L Thompson // Input basis apply if needed 366ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl)); 3670d0321e0SJeremy L Thompson 3680d0321e0SJeremy L Thompson // Output pointers, as necessary 369ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 370ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); 371ca735530SJeremy L Thompson if (e_mode == CEED_EVAL_NONE) { 3720d0321e0SJeremy L Thompson // Set the output Q-Vector to use the E-Vector data directly. 373ca735530SJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); 374ca735530SJeremy L Thompson CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); 3750d0321e0SJeremy L Thompson } 3760d0321e0SJeremy L Thompson } 3770d0321e0SJeremy L Thompson 3780d0321e0SJeremy L Thompson // Q function 379ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); 3800d0321e0SJeremy L Thompson 3810d0321e0SJeremy L Thompson // Output basis apply if needed 382ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 383*edb2538eSJeremy L Thompson CeedElemRestriction elem_rstr; 384ca735530SJeremy L Thompson CeedBasis basis; 385ca735530SJeremy L Thompson 386ca735530SJeremy L Thompson // Get elem_size, e_mode, size 387*edb2538eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 388*edb2538eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 389ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); 390ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); 3910d0321e0SJeremy L Thompson // Basis action 392ca735530SJeremy L Thompson switch (e_mode) { 3930d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 3940d0321e0SJeremy L Thompson break; 3950d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 396ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 397ca735530SJeremy L Thompson CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); 3980d0321e0SJeremy L Thompson break; 3990d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 400ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 401ca735530SJeremy L Thompson CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); 4020d0321e0SJeremy L Thompson break; 4030d0321e0SJeremy L Thompson // LCOV_EXCL_START 4040d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 4050d0321e0SJeremy L Thompson Ceed ceed; 4062b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 4072b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 4080d0321e0SJeremy L Thompson break; // Should not occur 4090d0321e0SJeremy L Thompson } 4100d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 4110d0321e0SJeremy L Thompson break; // TODO: Not implemented 4120d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 4130d0321e0SJeremy L Thompson break; // TODO: Not implemented 4140d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 4150d0321e0SJeremy L Thompson } 4160d0321e0SJeremy L Thompson } 4170d0321e0SJeremy L Thompson 4180d0321e0SJeremy L Thompson // Output restriction 419ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 420ca735530SJeremy L Thompson CeedVector vec; 421*edb2538eSJeremy L Thompson CeedElemRestriction elem_rstr; 422ca735530SJeremy L Thompson 4230d0321e0SJeremy L Thompson // Restore evec 424ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); 425ca735530SJeremy L Thompson if (e_mode == CEED_EVAL_NONE) { 426ca735530SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields])); 4270d0321e0SJeremy L Thompson } 4280d0321e0SJeremy L Thompson // Get output vector 429ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 4300d0321e0SJeremy L Thompson // Restrict 431*edb2538eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 4320d0321e0SJeremy L Thompson // Active 433ca735530SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; 4340d0321e0SJeremy L Thompson 435*edb2538eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request)); 4360d0321e0SJeremy L Thompson } 4370d0321e0SJeremy L Thompson 4380d0321e0SJeremy L Thompson // Restore input arrays 439ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); 4400d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 4410d0321e0SJeremy L Thompson } 4420d0321e0SJeremy L Thompson 4430d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 4440d0321e0SJeremy L Thompson // Core code for assembling linear QFunction 4450d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 4462b730f8bSJeremy L Thompson static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, 4470d0321e0SJeremy L Thompson CeedRequest *request) { 448ca735530SJeremy L Thompson Ceed ceed, ceed_parent; 449ca735530SJeremy L Thompson bool is_identity_qf; 450ca735530SJeremy L Thompson CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; 451e984cf9aSJeremy L Thompson CeedSize q_size; 452ca735530SJeremy L Thompson CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; 453ca735530SJeremy L Thompson CeedVector *active_inputs; 454ca735530SJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 455ca735530SJeremy L Thompson CeedQFunction qf; 456ca735530SJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 457ca735530SJeremy L Thompson CeedOperator_Cuda *impl; 458ca735530SJeremy L Thompson 4592b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 460ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent)); 461e984cf9aSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 462e984cf9aSJeremy L Thompson CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 463ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 464e984cf9aSJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 465ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 466ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 467ca735530SJeremy L Thompson active_inputs = impl->qf_active_in; 468ca735530SJeremy L Thompson num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; 4690d0321e0SJeremy L Thompson 4700d0321e0SJeremy L Thompson // Setup 4712b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetup_Cuda(op)); 4720d0321e0SJeremy L Thompson 4730d0321e0SJeremy L Thompson // Check for identity 474ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 475ca735530SJeremy L Thompson CeedCheck(!is_identity_qf, ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); 4760d0321e0SJeremy L Thompson 477ca735530SJeremy L Thompson // Input e_vecs and Restriction 478ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); 4790d0321e0SJeremy L Thompson 4800d0321e0SJeremy L Thompson // Count number of active input fields 481ca735530SJeremy L Thompson if (!num_active_in) { 482ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 483ca735530SJeremy L Thompson CeedScalar *q_vec_array; 484ca735530SJeremy L Thompson CeedVector vec; 485ca735530SJeremy L Thompson 4860d0321e0SJeremy L Thompson // Get input vector 487ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 4880d0321e0SJeremy L Thompson // Check if active input 4890d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 490ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); 491ca735530SJeremy L Thompson CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); 492ca735530SJeremy L Thompson CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); 493ca735530SJeremy L Thompson CeedCallBackend(CeedRealloc(num_active_in + size, &active_inputs)); 4940d0321e0SJeremy L Thompson for (CeedInt field = 0; field < size; field++) { 495ca735530SJeremy L Thompson q_size = (CeedSize)Q * num_elem; 496ca735530SJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field])); 497ca735530SJeremy L Thompson CeedCallBackend( 498ca735530SJeremy L Thompson CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); 4990d0321e0SJeremy L Thompson } 500ca735530SJeremy L Thompson num_active_in += size; 501ca735530SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); 5020d0321e0SJeremy L Thompson } 5030d0321e0SJeremy L Thompson } 504ca735530SJeremy L Thompson impl->num_active_in = num_active_in; 505ca735530SJeremy L Thompson impl->qf_active_in = active_inputs; 5060d0321e0SJeremy L Thompson } 5070d0321e0SJeremy L Thompson 5080d0321e0SJeremy L Thompson // Count number of active output fields 509ca735530SJeremy L Thompson if (!num_active_out) { 510ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 511ca735530SJeremy L Thompson CeedVector vec; 512ca735530SJeremy L Thompson 5130d0321e0SJeremy L Thompson // Get output vector 514ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 5150d0321e0SJeremy L Thompson // Check if active output 5160d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 517ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); 518ca735530SJeremy L Thompson num_active_out += size; 5190d0321e0SJeremy L Thompson } 5200d0321e0SJeremy L Thompson } 521ca735530SJeremy L Thompson impl->num_active_out = num_active_out; 5220d0321e0SJeremy L Thompson } 5230d0321e0SJeremy L Thompson 5240d0321e0SJeremy L Thompson // Check sizes 525ca735530SJeremy L Thompson CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); 5260d0321e0SJeremy L Thompson 5270d0321e0SJeremy L Thompson // Build objects if needed 5280d0321e0SJeremy L Thompson if (build_objects) { 5290d0321e0SJeremy L Thompson // Create output restriction 530ca735530SJeremy L Thompson CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ 531ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, 532ca735530SJeremy L Thompson num_active_in * num_active_out * num_elem * Q, strides, rstr)); 5330d0321e0SJeremy L Thompson // Create assembled vector 534ca735530SJeremy L Thompson CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; 535ca735530SJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); 5360d0321e0SJeremy L Thompson } 5372b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); 538ca735530SJeremy L Thompson CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array)); 5390d0321e0SJeremy L Thompson 5400d0321e0SJeremy L Thompson // Input basis apply 541ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl)); 5420d0321e0SJeremy L Thompson 5430d0321e0SJeremy L Thompson // Assemble QFunction 544ca735530SJeremy L Thompson for (CeedInt in = 0; in < num_active_in; in++) { 5450d0321e0SJeremy L Thompson // Set Inputs 546ca735530SJeremy L Thompson CeedCallBackend(CeedVectorSetValue(active_inputs[in], 1.0)); 547ca735530SJeremy L Thompson if (num_active_in > 1) { 548ca735530SJeremy L Thompson CeedCallBackend(CeedVectorSetValue(active_inputs[(in + num_active_in - 1) % num_active_in], 0.0)); 5490d0321e0SJeremy L Thompson } 5500d0321e0SJeremy L Thompson // Set Outputs 551ca735530SJeremy L Thompson for (CeedInt out = 0; out < num_output_fields; out++) { 552ca735530SJeremy L Thompson CeedVector vec; 553ca735530SJeremy L Thompson 5540d0321e0SJeremy L Thompson // Get output vector 555ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); 5560d0321e0SJeremy L Thompson // Check if active output 5570d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 558ca735530SJeremy L Thompson CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array)); 559ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); 560ca735530SJeremy L Thompson assembled_array += size * Q * num_elem; // Advance the pointer by the size of the output 5610d0321e0SJeremy L Thompson } 5620d0321e0SJeremy L Thompson } 5630d0321e0SJeremy L Thompson // Apply QFunction 564ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); 5650d0321e0SJeremy L Thompson } 5660d0321e0SJeremy L Thompson 567ca735530SJeremy L Thompson // Un-set output q_vecs to prevent accidental overwrite of Assembled 568ca735530SJeremy L Thompson for (CeedInt out = 0; out < num_output_fields; out++) { 569ca735530SJeremy L Thompson CeedVector vec; 570ca735530SJeremy L Thompson 5710d0321e0SJeremy L Thompson // Get output vector 572ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); 5730d0321e0SJeremy L Thompson // Check if active output 5740d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 575ca735530SJeremy L Thompson CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL)); 5760d0321e0SJeremy L Thompson } 5770d0321e0SJeremy L Thompson } 5780d0321e0SJeremy L Thompson 5790d0321e0SJeremy L Thompson // Restore input arrays 580ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl)); 5810d0321e0SJeremy L Thompson 5820d0321e0SJeremy L Thompson // Restore output 583ca735530SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); 5840d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 5850d0321e0SJeremy L Thompson } 5860d0321e0SJeremy L Thompson 5870d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 5880d0321e0SJeremy L Thompson // Assemble Linear QFunction 5890d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 5902b730f8bSJeremy L Thompson static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { 5912b730f8bSJeremy L Thompson return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr, request); 5920d0321e0SJeremy L Thompson } 5930d0321e0SJeremy L Thompson 5940d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 5950d0321e0SJeremy L Thompson // Update Assembled Linear QFunction 5960d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 5972b730f8bSJeremy L Thompson static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { 5982b730f8bSJeremy L Thompson return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled, &rstr, request); 5990d0321e0SJeremy L Thompson } 6000d0321e0SJeremy L Thompson 6010d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 6020d0321e0SJeremy L Thompson // Create point block restriction 6030d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 604ca735530SJeremy L Thompson static int CreatePointBlockRestriction(CeedElemRestriction rstr, CeedElemRestriction *point_block_rstr) { 6050d0321e0SJeremy L Thompson Ceed ceed; 606ca735530SJeremy L Thompson CeedSize l_size; 607ca735530SJeremy L Thompson CeedInt num_elem, num_comp, elem_size, comp_stride, *point_block_offsets; 6080d0321e0SJeremy L Thompson const CeedInt *offsets; 609ca735530SJeremy L Thompson 610ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 6112b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); 6120d0321e0SJeremy L Thompson 6130d0321e0SJeremy L Thompson // Expand offsets 614ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 615ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 616ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 617ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); 6182b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); 619ca735530SJeremy L Thompson CeedInt shift = num_comp; 620ca735530SJeremy L Thompson 621ca735530SJeremy L Thompson if (comp_stride != 1) shift *= num_comp; 622ca735530SJeremy L Thompson CeedCallBackend(CeedCalloc(num_elem * elem_size, &point_block_offsets)); 623ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_elem * elem_size; i++) { 624ca735530SJeremy L Thompson point_block_offsets[i] = offsets[i] * shift; 6250d0321e0SJeremy L Thompson } 6260d0321e0SJeremy L Thompson 6270d0321e0SJeremy L Thompson // Create new restriction 628ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp * num_comp, 1, l_size * num_comp, CEED_MEM_HOST, CEED_OWN_POINTER, 629ca735530SJeremy L Thompson point_block_offsets, point_block_rstr)); 6300d0321e0SJeremy L Thompson 6310d0321e0SJeremy L Thompson // Cleanup 6322b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); 6330d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 6340d0321e0SJeremy L Thompson } 6350d0321e0SJeremy L Thompson 6360d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 6370d0321e0SJeremy L Thompson // Assemble diagonal setup 6380d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 639ca735530SJeremy L Thompson static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, const bool is_point_block, CeedInt use_ceedsize_idx) { 6400d0321e0SJeremy L Thompson Ceed ceed; 641ca735530SJeremy L Thompson char *diagonal_kernel_path, *diagonal_kernel_source; 642ca735530SJeremy L Thompson CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0, num_nodes, num_qpts; 643ca735530SJeremy L Thompson CeedEvalMode *e_mode_in = NULL, *e_mode_out = NULL; 644ca735530SJeremy L Thompson CeedElemRestriction rstr_in = NULL, rstr_out = NULL; 645ca735530SJeremy L Thompson CeedBasis basis_in = NULL, basis_out = NULL; 646ca735530SJeremy L Thompson CeedQFunctionField *qf_fields; 6470d0321e0SJeremy L Thompson CeedQFunction qf; 648ca735530SJeremy L Thompson CeedOperatorField *op_fields; 649ca735530SJeremy L Thompson CeedOperator_Cuda *impl; 650ca735530SJeremy L Thompson 651ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 6522b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 653ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); 6540d0321e0SJeremy L Thompson 6550d0321e0SJeremy L Thompson // Determine active input basis 656ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); 657ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 658ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 6590d0321e0SJeremy L Thompson CeedVector vec; 660ca735530SJeremy L Thompson 661ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); 6620d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 663ca735530SJeremy L Thompson CeedEvalMode e_mode; 6640d0321e0SJeremy L Thompson CeedElemRestriction rstr; 665ca735530SJeremy L Thompson 666ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); 667ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); 668ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); 669ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); 670ca735530SJeremy L Thompson CeedCheck(!rstr_in || rstr_in == rstr, ceed, CEED_ERROR_BACKEND, 6716574a04fSJeremy L Thompson "Backend does not implement multi-field non-composite operator diagonal assembly"); 672ca735530SJeremy L Thompson rstr_in = rstr; 673ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); 674ca735530SJeremy L Thompson switch (e_mode) { 6750d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 6760d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 677ca735530SJeremy L Thompson CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in)); 678ca735530SJeremy L Thompson e_mode_in[num_e_mode_in] = e_mode; 679ca735530SJeremy L Thompson num_e_mode_in += 1; 6800d0321e0SJeremy L Thompson break; 6810d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 682ca735530SJeremy L Thompson CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in)); 683ca735530SJeremy L Thompson for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode; 684ca735530SJeremy L Thompson num_e_mode_in += dim; 6850d0321e0SJeremy L Thompson break; 6860d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: 6870d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 6880d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 6890d0321e0SJeremy L Thompson break; // Caught by QF Assembly 6900d0321e0SJeremy L Thompson } 6910d0321e0SJeremy L Thompson } 6920d0321e0SJeremy L Thompson } 6930d0321e0SJeremy L Thompson 6940d0321e0SJeremy L Thompson // Determine active output basis 695ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); 696ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 697ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 6980d0321e0SJeremy L Thompson CeedVector vec; 699ca735530SJeremy L Thompson 700ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); 7010d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 702ca735530SJeremy L Thompson CeedEvalMode e_mode; 7030d0321e0SJeremy L Thompson CeedElemRestriction rstr; 704ca735530SJeremy L Thompson 705ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); 706ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); 707ca735530SJeremy L Thompson CeedCheck(!rstr_out || rstr_out == rstr, ceed, CEED_ERROR_BACKEND, 7086574a04fSJeremy L Thompson "Backend does not implement multi-field non-composite operator diagonal assembly"); 709ca735530SJeremy L Thompson rstr_out = rstr; 710ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); 711ca735530SJeremy L Thompson switch (e_mode) { 7120d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 7130d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 714ca735530SJeremy L Thompson CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out)); 715ca735530SJeremy L Thompson e_mode_out[num_e_mode_out] = e_mode; 716ca735530SJeremy L Thompson num_e_mode_out += 1; 7170d0321e0SJeremy L Thompson break; 7180d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 719ca735530SJeremy L Thompson CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out)); 720ca735530SJeremy L Thompson for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode; 721ca735530SJeremy L Thompson num_e_mode_out += dim; 7220d0321e0SJeremy L Thompson break; 7230d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: 7240d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 7250d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 7260d0321e0SJeremy L Thompson break; // Caught by QF Assembly 7270d0321e0SJeremy L Thompson } 7280d0321e0SJeremy L Thompson } 7290d0321e0SJeremy L Thompson } 7300d0321e0SJeremy L Thompson 7310d0321e0SJeremy L Thompson // Operator data struct 7322b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 7332b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl->diag)); 7340d0321e0SJeremy L Thompson CeedOperatorDiag_Cuda *diag = impl->diag; 735ca735530SJeremy L Thompson 736ca735530SJeremy L Thompson diag->basis_in = basis_in; 737ca735530SJeremy L Thompson diag->basis_out = basis_out; 738ca735530SJeremy L Thompson diag->h_e_mode_in = e_mode_in; 739ca735530SJeremy L Thompson diag->h_e_mode_out = e_mode_out; 740ca735530SJeremy L Thompson diag->num_e_mode_in = num_e_mode_in; 741ca735530SJeremy L Thompson diag->num_e_mode_out = num_e_mode_out; 7420d0321e0SJeremy L Thompson 7430d0321e0SJeremy L Thompson // Assemble kernel 7442b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); 74523d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); 7462b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); 74723d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); 748ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); 749ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); 750ca735530SJeremy L Thompson diag->num_nodes = num_nodes; 751ca735530SJeremy L Thompson CeedCallCuda(ceed, 752ca735530SJeremy L Thompson CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_E_MODE_IN", num_e_mode_in, "NUM_E_MODE_OUT", num_e_mode_out, 753ca735530SJeremy L Thompson "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "NUM_COMP", num_comp, "USE_CEEDSIZE", use_ceedsize_idx)); 754eb7e6cafSJeremy L Thompson CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal)); 755eb7e6cafSJeremy L Thompson CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock)); 7562b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&diagonal_kernel_path)); 7572b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&diagonal_kernel_source)); 7580d0321e0SJeremy L Thompson 7590d0321e0SJeremy L Thompson // Basis matrices 760ca735530SJeremy L Thompson const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 761ca735530SJeremy L Thompson const CeedInt interp_bytes = q_bytes * num_nodes; 762ca735530SJeremy L Thompson const CeedInt grad_bytes = q_bytes * num_nodes * dim; 763ca735530SJeremy L Thompson const CeedInt e_mode_bytes = sizeof(CeedEvalMode); 764ca735530SJeremy L Thompson const CeedScalar *interp_in, *interp_out, *grad_in, *grad_out; 7650d0321e0SJeremy L Thompson 7660d0321e0SJeremy L Thompson // CEED_EVAL_NONE 7670d0321e0SJeremy L Thompson CeedScalar *identity = NULL; 768ca735530SJeremy L Thompson bool is_eval_none = false; 769ca735530SJeremy L Thompson 770ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_e_mode_in; i++) is_eval_none = is_eval_none || (e_mode_in[i] == CEED_EVAL_NONE); 771ca735530SJeremy L Thompson for (CeedInt i = 0; i < num_e_mode_out; i++) is_eval_none = is_eval_none || (e_mode_out[i] == CEED_EVAL_NONE); 772ca735530SJeremy L Thompson if (is_eval_none) { 773ca735530SJeremy L Thompson CeedCallBackend(CeedCalloc(num_qpts * num_nodes, &identity)); 774ca735530SJeremy L Thompson for (CeedInt i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0; 775ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_identity, interp_bytes)); 776ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_identity, identity, interp_bytes, cudaMemcpyHostToDevice)); 7770d0321e0SJeremy L Thompson } 7780d0321e0SJeremy L Thompson 7790d0321e0SJeremy L Thompson // CEED_EVAL_INTERP 780ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); 781ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interp_in, interp_bytes)); 782ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_interp_in, interp_in, interp_bytes, cudaMemcpyHostToDevice)); 783ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); 784ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interp_out, interp_bytes)); 785ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_interp_out, interp_out, interp_bytes, cudaMemcpyHostToDevice)); 7860d0321e0SJeremy L Thompson 7870d0321e0SJeremy L Thompson // CEED_EVAL_GRAD 788ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); 789ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_grad_in, grad_bytes)); 790ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_grad_in, grad_in, grad_bytes, cudaMemcpyHostToDevice)); 791ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); 792ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_grad_out, grad_bytes)); 793ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_grad_out, grad_out, grad_bytes, cudaMemcpyHostToDevice)); 7940d0321e0SJeremy L Thompson 795ca735530SJeremy L Thompson // Arrays of e_modes 796ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_e_mode_in, num_e_mode_in * e_mode_bytes)); 797ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_e_mode_in, e_mode_in, num_e_mode_in * e_mode_bytes, cudaMemcpyHostToDevice)); 798ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_e_mode_out, num_e_mode_out * e_mode_bytes)); 799ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_e_mode_out, e_mode_out, num_e_mode_out * e_mode_bytes, cudaMemcpyHostToDevice)); 8000d0321e0SJeremy L Thompson 8010d0321e0SJeremy L Thompson // Restriction 802ca735530SJeremy L Thompson diag->diag_rstr = rstr_out; 8030d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 8040d0321e0SJeremy L Thompson } 8050d0321e0SJeremy L Thompson 8060d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 8070d0321e0SJeremy L Thompson // Assemble diagonal common code 8080d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 809ca735530SJeremy L Thompson static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { 8100d0321e0SJeremy L Thompson Ceed ceed; 811ca735530SJeremy L Thompson CeedSize assembled_length = 0, assembled_qf_length = 0; 812ca735530SJeremy L Thompson CeedInt use_ceedsize_idx = 0, num_elem; 813ca735530SJeremy L Thompson CeedScalar *elem_diag_array; 814ca735530SJeremy L Thompson const CeedScalar *assembled_qf_array; 815ca735530SJeremy L Thompson CeedVector assembled_qf = NULL; 816ca735530SJeremy L Thompson CeedElemRestriction rstr = NULL; 8170d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 818ca735530SJeremy L Thompson 819ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 8202b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 8210d0321e0SJeremy L Thompson 8220d0321e0SJeremy L Thompson // Assemble QFunction 823ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request)); 8242b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); 8250d0321e0SJeremy L Thompson 826f7c1b517Snbeams CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); 827ca735530SJeremy L Thompson CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); 828ca735530SJeremy L Thompson if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; 829f7c1b517Snbeams 8300d0321e0SJeremy L Thompson // Setup 831ca735530SJeremy L Thompson if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op, is_point_block, use_ceedsize_idx)); 8320d0321e0SJeremy L Thompson CeedOperatorDiag_Cuda *diag = impl->diag; 833ca735530SJeremy L Thompson 8340d0321e0SJeremy L Thompson assert(diag != NULL); 8350d0321e0SJeremy L Thompson 8360d0321e0SJeremy L Thompson // Restriction 837ca735530SJeremy L Thompson if (is_point_block && !diag->point_block_rstr) { 838ca735530SJeremy L Thompson CeedElemRestriction point_block_rstr; 839ca735530SJeremy L Thompson 840ca735530SJeremy L Thompson CeedCallBackend(CreatePointBlockRestriction(diag->diag_rstr, &point_block_rstr)); 841ca735530SJeremy L Thompson diag->point_block_rstr = point_block_rstr; 8420d0321e0SJeremy L Thompson } 843ca735530SJeremy L Thompson CeedElemRestriction diag_rstr = is_point_block ? diag->point_block_rstr : diag->diag_rstr; 8440d0321e0SJeremy L Thompson 8450d0321e0SJeremy L Thompson // Create diagonal vector 846ca735530SJeremy L Thompson CeedVector elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; 847ca735530SJeremy L Thompson 848ca735530SJeremy L Thompson if (!elem_diag) { 849ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag)); 850ca735530SJeremy L Thompson if (is_point_block) diag->point_block_elem_diag = elem_diag; 851ca735530SJeremy L Thompson else diag->elem_diag = elem_diag; 8520d0321e0SJeremy L Thompson } 853ca735530SJeremy L Thompson CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); 8540d0321e0SJeremy L Thompson 8550d0321e0SJeremy L Thompson // Assemble element operator diagonals 856ca735530SJeremy L Thompson CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); 857ca735530SJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); 858ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); 8590d0321e0SJeremy L Thompson 8600d0321e0SJeremy L Thompson // Compute the diagonal of B^T D B 861ca735530SJeremy L Thompson int elem_per_block = 1; 862ca735530SJeremy L Thompson int grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0); 863ca735530SJeremy L Thompson void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_interp_out, 864ca735530SJeremy L Thompson &diag->d_grad_out, &diag->d_e_mode_in, &diag->d_e_mode_out, &assembled_qf_array, &elem_diag_array}; 865ca735530SJeremy L Thompson if (is_point_block) { 866ca735530SJeremy L Thompson CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearPointBlock, grid, diag->num_nodes, 1, elem_per_block, args)); 8670d0321e0SJeremy L Thompson } else { 868ca735530SJeremy L Thompson CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearDiagonal, grid, diag->num_nodes, 1, elem_per_block, args)); 8690d0321e0SJeremy L Thompson } 8700d0321e0SJeremy L Thompson 8710d0321e0SJeremy L Thompson // Restore arrays 872ca735530SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(elem_diag, &elem_diag_array)); 873ca735530SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); 8740d0321e0SJeremy L Thompson 8750d0321e0SJeremy L Thompson // Assemble local operator diagonal 876ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request)); 8770d0321e0SJeremy L Thompson 8780d0321e0SJeremy L Thompson // Cleanup 879ca735530SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&assembled_qf)); 8800d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 8810d0321e0SJeremy L Thompson } 8820d0321e0SJeremy L Thompson 8830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 8840d0321e0SJeremy L Thompson // Assemble Linear Diagonal 8850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 8862b730f8bSJeremy L Thompson static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) { 8872b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false)); 8886aa95790SJeremy L Thompson return CEED_ERROR_SUCCESS; 8890d0321e0SJeremy L Thompson } 8900d0321e0SJeremy L Thompson 8910d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 8920d0321e0SJeremy L Thompson // Assemble Linear Point Block Diagonal 8930d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 8942b730f8bSJeremy L Thompson static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) { 8952b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true)); 8966aa95790SJeremy L Thompson return CEED_ERROR_SUCCESS; 8970d0321e0SJeremy L Thompson } 8980d0321e0SJeremy L Thompson 8990d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 900cc132f9aSnbeams // Single operator assembly setup 901cc132f9aSnbeams //------------------------------------------------------------------------------ 902f7c1b517Snbeams static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { 903cc132f9aSnbeams Ceed ceed; 904ca735530SJeremy L Thompson char *assembly_kernel_path, *assembly_kernel_source; 905ca735530SJeremy L Thompson CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_qpts = 0, elem_size = 0, 906ca735530SJeremy L Thompson num_e_mode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0, num_elem, num_comp; 907ca735530SJeremy L Thompson CeedEvalMode *eval_mode_in = NULL, *eval_mode_out = NULL; 908ca735530SJeremy L Thompson CeedElemRestriction rstr_in = NULL, rstr_out = NULL; 909ca735530SJeremy L Thompson CeedBasis basis_in = NULL, basis_out = NULL; 910ca735530SJeremy L Thompson CeedQFunctionField *qf_fields; 911ca735530SJeremy L Thompson CeedQFunction qf; 912ca735530SJeremy L Thompson CeedOperatorField *input_fields, *output_fields; 913cc132f9aSnbeams CeedOperator_Cuda *impl; 914ca735530SJeremy L Thompson 915ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 9162b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 917cc132f9aSnbeams 918cc132f9aSnbeams // Get intput and output fields 9192b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); 920cc132f9aSnbeams 921cc132f9aSnbeams // Determine active input basis eval mode 9222b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 9232b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 924cc132f9aSnbeams // Note that the kernel will treat each dimension of a gradient action separately; 925ca735530SJeremy L Thompson // i.e., when an active input has a CEED_EVAL_GRAD mode, num_e_mode_in will increment by dim. 926ea61e9acSJeremy L Thompson // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so 927cc132f9aSnbeams // num_B_in_mats_to_load will be incremented by 1. 928cc132f9aSnbeams for (CeedInt i = 0; i < num_input_fields; i++) { 929cc132f9aSnbeams CeedVector vec; 930ca735530SJeremy L Thompson 9312b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); 932cc132f9aSnbeams if (vec == CEED_VECTOR_ACTIVE) { 933ca735530SJeremy L Thompson CeedEvalMode eval_mode; 934ca735530SJeremy L Thompson 9352b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in)); 9362b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); 937ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); 9382b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); 939ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); 9402b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 941cc132f9aSnbeams if (eval_mode != CEED_EVAL_NONE) { 9422b730f8bSJeremy L Thompson CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); 943cc132f9aSnbeams eval_mode_in[num_B_in_mats_to_load] = eval_mode; 944cc132f9aSnbeams num_B_in_mats_to_load += 1; 945cc132f9aSnbeams if (eval_mode == CEED_EVAL_GRAD) { 946ca735530SJeremy L Thompson num_e_mode_in += dim; 947ca735530SJeremy L Thompson size_B_in += dim * elem_size * num_qpts; 948cc132f9aSnbeams } else { 949ca735530SJeremy L Thompson num_e_mode_in += 1; 950ca735530SJeremy L Thompson size_B_in += elem_size * num_qpts; 951cc132f9aSnbeams } 952cc132f9aSnbeams } 953cc132f9aSnbeams } 954cc132f9aSnbeams } 955cc132f9aSnbeams 956cc132f9aSnbeams // Determine active output basis; basis_out and rstr_out only used if same as input, TODO 9572b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 958cc132f9aSnbeams for (CeedInt i = 0; i < num_output_fields; i++) { 959cc132f9aSnbeams CeedVector vec; 960ca735530SJeremy L Thompson 9612b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); 962cc132f9aSnbeams if (vec == CEED_VECTOR_ACTIVE) { 963ca735530SJeremy L Thompson CeedEvalMode eval_mode; 964ca735530SJeremy L Thompson 9652b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out)); 9662b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); 9676574a04fSJeremy L Thompson CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); 9682b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 969cc132f9aSnbeams if (eval_mode != CEED_EVAL_NONE) { 9702b730f8bSJeremy L Thompson CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); 971cc132f9aSnbeams eval_mode_out[num_B_out_mats_to_load] = eval_mode; 972cc132f9aSnbeams num_B_out_mats_to_load += 1; 973cc132f9aSnbeams if (eval_mode == CEED_EVAL_GRAD) { 974ca735530SJeremy L Thompson num_e_mode_out += dim; 975ca735530SJeremy L Thompson size_B_out += dim * elem_size * num_qpts; 976cc132f9aSnbeams } else { 977ca735530SJeremy L Thompson num_e_mode_out += 1; 978ca735530SJeremy L Thompson size_B_out += elem_size * num_qpts; 979cc132f9aSnbeams } 980cc132f9aSnbeams } 981cc132f9aSnbeams } 982cc132f9aSnbeams } 983ca735530SJeremy L Thompson CeedCheck(num_e_mode_in > 0 && num_e_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); 984cc132f9aSnbeams 985ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem)); 986ca735530SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp)); 987cc132f9aSnbeams 9882b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl->asmb)); 989cc132f9aSnbeams CeedOperatorAssemble_Cuda *asmb = impl->asmb; 990ca735530SJeremy L Thompson asmb->num_elem = num_elem; 991cc132f9aSnbeams 992cc132f9aSnbeams // Compile kernels 993ca735530SJeremy L Thompson int elem_per_block = 1; 994ca735530SJeremy L Thompson asmb->elem_per_block = elem_per_block; 995ca735530SJeremy L Thompson CeedInt block_size = elem_size * elem_size * elem_per_block; 996cc132f9aSnbeams Ceed_Cuda *cuda_data; 997ca735530SJeremy L Thompson 9982b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &cuda_data)); 9992b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path)); 100023d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); 10012b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); 100223d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n"); 100307b31e0eSJeremy L Thompson bool fallback = block_size > cuda_data->device_prop.maxThreadsPerBlock; 1004ca735530SJeremy L Thompson 100507b31e0eSJeremy L Thompson if (fallback) { 100659ad764aSnbeams // Use fallback kernel with 1D threadblock 1007ca735530SJeremy L Thompson block_size = elem_size * elem_per_block; 1008ca735530SJeremy L Thompson asmb->block_size_x = elem_size; 100959ad764aSnbeams asmb->block_size_y = 1; 101059ad764aSnbeams } else { // Use kernel with 2D threadblock 1011ca735530SJeremy L Thompson asmb->block_size_x = elem_size; 1012ca735530SJeremy L Thompson asmb->block_size_y = elem_size; 101307b31e0eSJeremy L Thompson } 1014ca735530SJeremy L Thompson CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 8, "NUM_ELEM", num_elem, "NUM_E_MODE_IN", num_e_mode_in, 1015ca735530SJeremy L Thompson "NUM_E_MODE_OUT", num_e_mode_out, "NUM_QPTS", num_qpts, "NUM_NODES", elem_size, "BLOCK_SIZE", block_size, 1016ca735530SJeremy L Thompson "NUM_COMP", num_comp, "USE_CEEDSIZE", use_ceedsize_idx)); 1017b2165e7aSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, fallback ? "linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble)); 10182b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&assembly_kernel_path)); 10192b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&assembly_kernel_source)); 1020cc132f9aSnbeams 1021cc132f9aSnbeams // Build 'full' B matrices (not 1D arrays used for tensor-product matrices) 1022cc132f9aSnbeams const CeedScalar *interp_in, *grad_in; 1023ca735530SJeremy L Thompson 10242b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); 10252b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); 1026cc132f9aSnbeams 1027cc132f9aSnbeams // Load into B_in, in order that they will be used in eval_mode 1028cc132f9aSnbeams const CeedInt inBytes = size_B_in * sizeof(CeedScalar); 1029cc132f9aSnbeams CeedInt mat_start = 0; 1030ca735530SJeremy L Thompson 10312b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_in, inBytes)); 1032cc132f9aSnbeams for (int i = 0; i < num_B_in_mats_to_load; i++) { 1033cc132f9aSnbeams CeedEvalMode eval_mode = eval_mode_in[i]; 1034ca735530SJeremy L Thompson 1035cc132f9aSnbeams if (eval_mode == CEED_EVAL_INTERP) { 1036ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], interp_in, elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); 1037ca735530SJeremy L Thompson mat_start += elem_size * num_qpts; 1038cc132f9aSnbeams } else if (eval_mode == CEED_EVAL_GRAD) { 1039ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); 1040ca735530SJeremy L Thompson mat_start += dim * elem_size * num_qpts; 1041cc132f9aSnbeams } 1042cc132f9aSnbeams } 1043cc132f9aSnbeams 1044cc132f9aSnbeams const CeedScalar *interp_out, *grad_out; 1045ca735530SJeremy L Thompson 1046ca735530SJeremy L Thompson // Note that this function currently assumes 1 basis, so this should always be true for now 1047cc132f9aSnbeams if (basis_out == basis_in) { 1048cc132f9aSnbeams interp_out = interp_in; 1049cc132f9aSnbeams grad_out = grad_in; 1050cc132f9aSnbeams } else { 10512b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); 10522b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); 1053cc132f9aSnbeams } 1054cc132f9aSnbeams 1055cc132f9aSnbeams // Load into B_out, in order that they will be used in eval_mode 1056cc132f9aSnbeams const CeedInt outBytes = size_B_out * sizeof(CeedScalar); 1057cc132f9aSnbeams mat_start = 0; 1058ca735530SJeremy L Thompson 10592b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, outBytes)); 1060cc132f9aSnbeams for (int i = 0; i < num_B_out_mats_to_load; i++) { 1061cc132f9aSnbeams CeedEvalMode eval_mode = eval_mode_out[i]; 1062ca735530SJeremy L Thompson 1063cc132f9aSnbeams if (eval_mode == CEED_EVAL_INTERP) { 1064ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], interp_out, elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); 1065ca735530SJeremy L Thompson mat_start += elem_size * num_qpts; 1066cc132f9aSnbeams } else if (eval_mode == CEED_EVAL_GRAD) { 1067ca735530SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); 1068ca735530SJeremy L Thompson mat_start += dim * elem_size * num_qpts; 1069cc132f9aSnbeams } 1070cc132f9aSnbeams } 1071cc132f9aSnbeams return CEED_ERROR_SUCCESS; 1072cc132f9aSnbeams } 1073cc132f9aSnbeams 1074cc132f9aSnbeams //------------------------------------------------------------------------------ 1075cefa2673SJeremy L Thompson // Assemble matrix data for COO matrix of assembled operator. 1076cefa2673SJeremy L Thompson // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic. 1077cefa2673SJeremy L Thompson // 1078ea61e9acSJeremy L Thompson // Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval 1079cefa2673SJeremy L Thompson // modes). 1080cefa2673SJeremy L Thompson // TODO: allow multiple active input restrictions/basis objects 1081cc132f9aSnbeams //------------------------------------------------------------------------------ 10822b730f8bSJeremy L Thompson static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) { 1083cc132f9aSnbeams Ceed ceed; 1084ca735530SJeremy L Thompson CeedSize values_length = 0, assembled_qf_length = 0; 1085ca735530SJeremy L Thompson CeedInt use_ceedsize_idx = 0; 1086ca735530SJeremy L Thompson CeedScalar *values_array; 1087ca735530SJeremy L Thompson const CeedScalar *qf_array; 1088ca735530SJeremy L Thompson CeedVector assembled_qf = NULL; 1089ca735530SJeremy L Thompson CeedElemRestriction rstr_q = NULL; 1090cc132f9aSnbeams CeedOperator_Cuda *impl; 1091ca735530SJeremy L Thompson 1092ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 10932b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 1094cc132f9aSnbeams 1095cc132f9aSnbeams // Assemble QFunction 10962b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); 10972b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); 109828ec399dSJeremy L Thompson CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); 1099cc132f9aSnbeams values_array += offset; 11002b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); 1101cc132f9aSnbeams 1102f7c1b517Snbeams CeedCallBackend(CeedVectorGetLength(values, &values_length)); 1103f7c1b517Snbeams CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); 1104f7c1b517Snbeams if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; 1105f7c1b517Snbeams // Setup 1106f7c1b517Snbeams if (!impl->asmb) { 1107f7c1b517Snbeams CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx)); 1108f7c1b517Snbeams assert(impl->asmb != NULL); 1109f7c1b517Snbeams } 1110f7c1b517Snbeams 1111cc132f9aSnbeams // Compute B^T D B 1112ca735530SJeremy L Thompson const CeedInt num_elem = impl->asmb->num_elem; 1113ca735530SJeremy L Thompson const CeedInt elem_per_block = impl->asmb->elem_per_block; 1114ca735530SJeremy L Thompson const CeedInt grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0); 11152b730f8bSJeremy L Thompson void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; 1116ca735530SJeremy L Thompson 11172b730f8bSJeremy L Thompson CeedCallBackend( 1118ca735530SJeremy L Thompson CeedRunKernelDim_Cuda(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elem_per_block, args)); 1119cc132f9aSnbeams 1120cc132f9aSnbeams // Restore arrays 11212b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); 11222b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array)); 1123cc132f9aSnbeams 1124cc132f9aSnbeams // Cleanup 11252b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&assembled_qf)); 1126cc132f9aSnbeams return CEED_ERROR_SUCCESS; 1127cc132f9aSnbeams } 1128cc132f9aSnbeams 1129cc132f9aSnbeams //------------------------------------------------------------------------------ 11300d0321e0SJeremy L Thompson // Create operator 11310d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 11320d0321e0SJeremy L Thompson int CeedOperatorCreate_Cuda(CeedOperator op) { 11330d0321e0SJeremy L Thompson Ceed ceed; 11340d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 11350d0321e0SJeremy L Thompson 1136ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 11372b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 11382b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetData(op, impl)); 11390d0321e0SJeremy L Thompson 11402b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda)); 11412b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda)); 11422b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Cuda)); 11432b730f8bSJeremy L Thompson CeedCallBackend( 11442b730f8bSJeremy L Thompson CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda)); 11452b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda)); 11462b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda)); 11472b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda)); 11480d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 11490d0321e0SJeremy L Thompson } 11500d0321e0SJeremy L Thompson 11510d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1152