13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 92b730f8bSJeremy L Thompson #include <ceed/backend.h> 102b730f8bSJeremy L Thompson #include <ceed/jit-tools.h> 11c85e8640SSebastian Grimberg #include <assert.h> 120d0321e0SJeremy L Thompson #include <cuda.h> 130d0321e0SJeremy L Thompson #include <cuda_runtime.h> 140d0321e0SJeremy L Thompson #include <stdbool.h> 150d0321e0SJeremy L Thompson #include <string.h> 162b730f8bSJeremy L Thompson 1749aac155SJeremy L Thompson #include "../cuda/ceed-cuda-common.h" 180d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 192b730f8bSJeremy L Thompson #include "ceed-cuda-ref.h" 200d0321e0SJeremy L Thompson 210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 220d0321e0SJeremy L Thompson // Destroy operator 230d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 240d0321e0SJeremy L Thompson static int CeedOperatorDestroy_Cuda(CeedOperator op) { 250d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 262b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 270d0321e0SJeremy L Thompson 280d0321e0SJeremy L Thompson // Apply data 290d0321e0SJeremy L Thompson for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) { 302b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->evecs[i])); 310d0321e0SJeremy L Thompson } 322b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->evecs)); 330d0321e0SJeremy L Thompson 340d0321e0SJeremy L Thompson for (CeedInt i = 0; i < impl->numein; i++) { 352b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->qvecsin[i])); 360d0321e0SJeremy L Thompson } 372b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->qvecsin)); 380d0321e0SJeremy L Thompson 390d0321e0SJeremy L Thompson for (CeedInt i = 0; i < impl->numeout; i++) { 402b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->qvecsout[i])); 410d0321e0SJeremy L Thompson } 422b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->qvecsout)); 430d0321e0SJeremy L Thompson 440d0321e0SJeremy L Thompson // QFunction assembly data 450d0321e0SJeremy L Thompson for (CeedInt i = 0; i < impl->qfnumactivein; i++) { 462b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->qfactivein[i])); 470d0321e0SJeremy L Thompson } 482b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->qfactivein)); 490d0321e0SJeremy L Thompson 500d0321e0SJeremy L Thompson // Diag data 510d0321e0SJeremy L Thompson if (impl->diag) { 520d0321e0SJeremy L Thompson Ceed ceed; 532b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 542b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(impl->diag->module)); 552b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->diag->h_emodein)); 562b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->diag->h_emodeout)); 572b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_emodein)); 582b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_emodeout)); 592b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_identity)); 602b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_interpin)); 612b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_interpout)); 622b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_gradin)); 632b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->diag->d_gradout)); 642b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr)); 652b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->diag->elemdiag)); 662b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&impl->diag->pbelemdiag)); 670d0321e0SJeremy L Thompson } 682b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->diag)); 690d0321e0SJeremy L Thompson 70cc132f9aSnbeams if (impl->asmb) { 71cc132f9aSnbeams Ceed ceed; 722b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 732b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(impl->asmb->module)); 742b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_in)); 752b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_out)); 76cc132f9aSnbeams } 772b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->asmb)); 78cc132f9aSnbeams 792b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 800d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 810d0321e0SJeremy L Thompson } 820d0321e0SJeremy L Thompson 830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 840d0321e0SJeremy L Thompson // Setup infields or outfields 850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 862b730f8bSJeremy L Thompson static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool isinput, CeedVector *evecs, CeedVector *qvecs, CeedInt starte, 872b730f8bSJeremy L Thompson CeedInt numfields, CeedInt Q, CeedInt numelements) { 882b730f8bSJeremy L Thompson CeedInt dim, size; 89d2643443SJeremy L Thompson CeedSize q_size; 900d0321e0SJeremy L Thompson Ceed ceed; 912b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 920d0321e0SJeremy L Thompson CeedBasis basis; 930d0321e0SJeremy L Thompson CeedElemRestriction Erestrict; 940d0321e0SJeremy L Thompson CeedOperatorField *opfields; 950d0321e0SJeremy L Thompson CeedQFunctionField *qffields; 960d0321e0SJeremy L Thompson CeedVector fieldvec; 970d0321e0SJeremy L Thompson bool strided; 980d0321e0SJeremy L Thompson bool skiprestrict; 990d0321e0SJeremy L Thompson 1000d0321e0SJeremy L Thompson if (isinput) { 1012b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); 1022b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); 1030d0321e0SJeremy L Thompson } else { 1042b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); 1052b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); 1060d0321e0SJeremy L Thompson } 1070d0321e0SJeremy L Thompson 1080d0321e0SJeremy L Thompson // Loop over fields 1090d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numfields; i++) { 1100d0321e0SJeremy L Thompson CeedEvalMode emode; 1112b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); 1120d0321e0SJeremy L Thompson 1130d0321e0SJeremy L Thompson strided = false; 1140d0321e0SJeremy L Thompson skiprestrict = false; 1150d0321e0SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { 1162b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict)); 1170d0321e0SJeremy L Thompson 1180d0321e0SJeremy L Thompson // Check whether this field can skip the element restriction: 119ea61e9acSJeremy L Thompson // must be passive input, with emode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. 1200d0321e0SJeremy L Thompson 1210d0321e0SJeremy L Thompson // First, check whether the field is input or output: 1220d0321e0SJeremy L Thompson if (isinput) { 1230d0321e0SJeremy L Thompson // Check for passive input: 1242b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &fieldvec)); 1250d0321e0SJeremy L Thompson if (fieldvec != CEED_VECTOR_ACTIVE) { 1260d0321e0SJeremy L Thompson // Check emode 1270d0321e0SJeremy L Thompson if (emode == CEED_EVAL_NONE) { 1280d0321e0SJeremy L Thompson // Check for strided restriction 1292b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &strided)); 1300d0321e0SJeremy L Thompson if (strided) { 1310d0321e0SJeremy L Thompson // Check if vector is already in preferred backend ordering 1322b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &skiprestrict)); 1330d0321e0SJeremy L Thompson } 1340d0321e0SJeremy L Thompson } 1350d0321e0SJeremy L Thompson } 1360d0321e0SJeremy L Thompson } 1370d0321e0SJeremy L Thompson if (skiprestrict) { 138ea61e9acSJeremy L Thompson // We do not need an E-Vector, but will use the input field vector's data directly in the operator application. 1390d0321e0SJeremy L Thompson evecs[i + starte] = NULL; 1400d0321e0SJeremy L Thompson } else { 1412b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionCreateVector(Erestrict, NULL, &evecs[i + starte])); 1420d0321e0SJeremy L Thompson } 1430d0321e0SJeremy L Thompson } 1440d0321e0SJeremy L Thompson 1450d0321e0SJeremy L Thompson switch (emode) { 1460d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 1472b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); 148d2643443SJeremy L Thompson q_size = (CeedSize)numelements * Q * size; 1492b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); 1500d0321e0SJeremy L Thompson break; 1510d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 1522b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); 153d2643443SJeremy L Thompson q_size = (CeedSize)numelements * Q * size; 1542b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); 1550d0321e0SJeremy L Thompson break; 1560d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 1572b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); 1582b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); 1592b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 160d2643443SJeremy L Thompson q_size = (CeedSize)numelements * Q * size; 1612b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); 1620d0321e0SJeremy L Thompson break; 1630d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: // Only on input fields 1642b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); 165d2643443SJeremy L Thompson q_size = (CeedSize)numelements * Q; 1662b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); 1676574a04fSJeremy L Thompson CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, qvecs[i])); 1680d0321e0SJeremy L Thompson break; 1690d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 1700d0321e0SJeremy L Thompson break; // TODO: Not implemented 1710d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 1720d0321e0SJeremy L Thompson break; // TODO: Not implemented 1730d0321e0SJeremy L Thompson } 1740d0321e0SJeremy L Thompson } 1750d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1760d0321e0SJeremy L Thompson } 1770d0321e0SJeremy L Thompson 1780d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 179ea61e9acSJeremy L Thompson // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction. 1800d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1810d0321e0SJeremy L Thompson static int CeedOperatorSetup_Cuda(CeedOperator op) { 1820d0321e0SJeremy L Thompson bool setupdone; 1832b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorIsSetupDone(op, &setupdone)); 1842b730f8bSJeremy L Thompson if (setupdone) return CEED_ERROR_SUCCESS; 1850d0321e0SJeremy L Thompson Ceed ceed; 1862b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1870d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 1882b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 1890d0321e0SJeremy L Thompson CeedQFunction qf; 1902b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1910d0321e0SJeremy L Thompson CeedInt Q, numelements, numinputfields, numoutputfields; 1922b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 1932b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); 1940d0321e0SJeremy L Thompson CeedOperatorField *opinputfields, *opoutputfields; 1952b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); 1960d0321e0SJeremy L Thompson CeedQFunctionField *qfinputfields, *qfoutputfields; 1972b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); 1980d0321e0SJeremy L Thompson 1990d0321e0SJeremy L Thompson // Allocate 2002b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(numinputfields + numoutputfields, &impl->evecs)); 2010d0321e0SJeremy L Thompson 2022b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin)); 2032b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout)); 2040d0321e0SJeremy L Thompson 2052b730f8bSJeremy L Thompson impl->numein = numinputfields; 2062b730f8bSJeremy L Thompson impl->numeout = numoutputfields; 2070d0321e0SJeremy L Thompson 2080d0321e0SJeremy L Thompson // Set up infield and outfield evecs and qvecs 2090d0321e0SJeremy L Thompson // Infields 2102b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, impl->evecs, impl->qvecsin, 0, numinputfields, Q, numelements)); 2110d0321e0SJeremy L Thompson 2120d0321e0SJeremy L Thompson // Outfields 2132b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, impl->evecs, impl->qvecsout, numinputfields, numoutputfields, Q, numelements)); 2140d0321e0SJeremy L Thompson 2152b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetSetupDone(op)); 2160d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2170d0321e0SJeremy L Thompson } 2180d0321e0SJeremy L Thompson 2190d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2200d0321e0SJeremy L Thompson // Setup Operator Inputs 2210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2222b730f8bSJeremy L Thompson static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 2230d0321e0SJeremy L Thompson CeedVector invec, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], 2240d0321e0SJeremy L Thompson CeedOperator_Cuda *impl, CeedRequest *request) { 2250d0321e0SJeremy L Thompson CeedEvalMode emode; 2260d0321e0SJeremy L Thompson CeedVector vec; 2270d0321e0SJeremy L Thompson CeedElemRestriction Erestrict; 2280d0321e0SJeremy L Thompson 2290d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 2300d0321e0SJeremy L Thompson // Get input vector 2312b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 2320d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 2332b730f8bSJeremy L Thompson if (skipactive) continue; 2342b730f8bSJeremy L Thompson else vec = invec; 2350d0321e0SJeremy L Thompson } 2360d0321e0SJeremy L Thompson 2372b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); 2380d0321e0SJeremy L Thompson if (emode == CEED_EVAL_WEIGHT) { // Skip 2390d0321e0SJeremy L Thompson } else { 2400d0321e0SJeremy L Thompson // Get input vector 2412b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 2420d0321e0SJeremy L Thompson // Get input element restriction 2432b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); 2442b730f8bSJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) vec = invec; 2450d0321e0SJeremy L Thompson // Restrict, if necessary 2460d0321e0SJeremy L Thompson if (!impl->evecs[i]) { 2470d0321e0SJeremy L Thompson // No restriction for this field; read data directly from vec. 2482b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); 2490d0321e0SJeremy L Thompson } else { 2502b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, impl->evecs[i], request)); 2510d0321e0SJeremy L Thompson // Get evec 2522b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); 2530d0321e0SJeremy L Thompson } 2540d0321e0SJeremy L Thompson } 2550d0321e0SJeremy L Thompson } 2560d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2570d0321e0SJeremy L Thompson } 2580d0321e0SJeremy L Thompson 2590d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2600d0321e0SJeremy L Thompson // Input Basis Action 2610d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2622b730f8bSJeremy L Thompson static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 2632b730f8bSJeremy L Thompson CeedInt numinputfields, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], 2642b730f8bSJeremy L Thompson CeedOperator_Cuda *impl) { 2650d0321e0SJeremy L Thompson CeedInt elemsize, size; 2660d0321e0SJeremy L Thompson CeedElemRestriction Erestrict; 2670d0321e0SJeremy L Thompson CeedEvalMode emode; 2680d0321e0SJeremy L Thompson CeedBasis basis; 2690d0321e0SJeremy L Thompson 2700d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 2710d0321e0SJeremy L Thompson // Skip active input 2720d0321e0SJeremy L Thompson if (skipactive) { 2730d0321e0SJeremy L Thompson CeedVector vec; 2742b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 2752b730f8bSJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) continue; 2760d0321e0SJeremy L Thompson } 2770d0321e0SJeremy L Thompson // Get elemsize, emode, size 2782b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); 2792b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); 2802b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); 2812b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); 2820d0321e0SJeremy L Thompson // Basis action 2830d0321e0SJeremy L Thompson switch (emode) { 2840d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 2852b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i])); 2860d0321e0SJeremy L Thompson break; 2870d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 2882b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); 2892b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->evecs[i], impl->qvecsin[i])); 2900d0321e0SJeremy L Thompson break; 2910d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 2922b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); 2932b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->evecs[i], impl->qvecsin[i])); 2940d0321e0SJeremy L Thompson break; 2950d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: 2960d0321e0SJeremy L Thompson break; // No action 2970d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 2980d0321e0SJeremy L Thompson break; // TODO: Not implemented 2990d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 3000d0321e0SJeremy L Thompson break; // TODO: Not implemented 3010d0321e0SJeremy L Thompson } 3020d0321e0SJeremy L Thompson } 3030d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3040d0321e0SJeremy L Thompson } 3050d0321e0SJeremy L Thompson 3060d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3070d0321e0SJeremy L Thompson // Restore Input Vectors 3080d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3092b730f8bSJeremy L Thompson static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 3102b730f8bSJeremy L Thompson const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { 3110d0321e0SJeremy L Thompson CeedEvalMode emode; 3120d0321e0SJeremy L Thompson CeedVector vec; 3130d0321e0SJeremy L Thompson 3140d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 3150d0321e0SJeremy L Thompson // Skip active input 3160d0321e0SJeremy L Thompson if (skipactive) { 3172b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 3182b730f8bSJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) continue; 3190d0321e0SJeremy L Thompson } 3202b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); 3210d0321e0SJeremy L Thompson if (emode == CEED_EVAL_WEIGHT) { // Skip 3220d0321e0SJeremy L Thompson } else { 3230d0321e0SJeremy L Thompson if (!impl->evecs[i]) { // This was a skiprestrict case 3242b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 3252b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&edata[i])); 3260d0321e0SJeremy L Thompson } else { 3272b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(impl->evecs[i], (const CeedScalar **)&edata[i])); 3280d0321e0SJeremy L Thompson } 3290d0321e0SJeremy L Thompson } 3300d0321e0SJeremy L Thompson } 3310d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3320d0321e0SJeremy L Thompson } 3330d0321e0SJeremy L Thompson 3340d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3350d0321e0SJeremy L Thompson // Apply and add to output 3360d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3372b730f8bSJeremy L Thompson static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request) { 3380d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 3392b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 3400d0321e0SJeremy L Thompson CeedQFunction qf; 3412b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 3420d0321e0SJeremy L Thompson CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size; 3432b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 3442b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); 3450d0321e0SJeremy L Thompson CeedOperatorField *opinputfields, *opoutputfields; 3462b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); 3470d0321e0SJeremy L Thompson CeedQFunctionField *qfinputfields, *qfoutputfields; 3482b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); 3490d0321e0SJeremy L Thompson CeedEvalMode emode; 3500d0321e0SJeremy L Thompson CeedVector vec; 3510d0321e0SJeremy L Thompson CeedBasis basis; 3520d0321e0SJeremy L Thompson CeedElemRestriction Erestrict; 3530d0321e0SJeremy L Thompson CeedScalar *edata[2 * CEED_FIELD_MAX] = {0}; 3540d0321e0SJeremy L Thompson 3550d0321e0SJeremy L Thompson // Setup 3562b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetup_Cuda(op)); 3570d0321e0SJeremy L Thompson 3580d0321e0SJeremy L Thompson // Input Evecs and Restriction 3592b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, opinputfields, invec, false, edata, impl, request)); 3600d0321e0SJeremy L Thompson 3610d0321e0SJeremy L Thompson // Input basis apply if needed 3622b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, numinputfields, false, edata, impl)); 3630d0321e0SJeremy L Thompson 3640d0321e0SJeremy L Thompson // Output pointers, as necessary 3650d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 3662b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); 3670d0321e0SJeremy L Thompson if (emode == CEED_EVAL_NONE) { 3680d0321e0SJeremy L Thompson // Set the output Q-Vector to use the E-Vector data directly. 3692b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, &edata[i + numinputfields])); 3702b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i + numinputfields])); 3710d0321e0SJeremy L Thompson } 3720d0321e0SJeremy L Thompson } 3730d0321e0SJeremy L Thompson 3740d0321e0SJeremy L Thompson // Q function 3752b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout)); 3760d0321e0SJeremy L Thompson 3770d0321e0SJeremy L Thompson // Output basis apply if needed 3780d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 3790d0321e0SJeremy L Thompson // Get elemsize, emode, size 3802b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); 3812b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); 3822b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); 3832b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); 3840d0321e0SJeremy L Thompson // Basis action 3850d0321e0SJeremy L Thompson switch (emode) { 3860d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 3870d0321e0SJeremy L Thompson break; 3880d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 3892b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); 3902b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->qvecsout[i], impl->evecs[i + impl->numein])); 3910d0321e0SJeremy L Thompson break; 3920d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 3932b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); 3942b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->qvecsout[i], impl->evecs[i + impl->numein])); 3950d0321e0SJeremy L Thompson break; 3960d0321e0SJeremy L Thompson // LCOV_EXCL_START 3970d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 3980d0321e0SJeremy L Thompson Ceed ceed; 3992b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 4002b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 4010d0321e0SJeremy L Thompson break; // Should not occur 4020d0321e0SJeremy L Thompson } 4030d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 4040d0321e0SJeremy L Thompson break; // TODO: Not implemented 4050d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 4060d0321e0SJeremy L Thompson break; // TODO: Not implemented 4070d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 4080d0321e0SJeremy L Thompson } 4090d0321e0SJeremy L Thompson } 4100d0321e0SJeremy L Thompson 4110d0321e0SJeremy L Thompson // Output restriction 4120d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 4130d0321e0SJeremy L Thompson // Restore evec 4142b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); 4150d0321e0SJeremy L Thompson if (emode == CEED_EVAL_NONE) { 4162b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(impl->evecs[i + impl->numein], &edata[i + numinputfields])); 4170d0321e0SJeremy L Thompson } 4180d0321e0SJeremy L Thompson // Get output vector 4192b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); 4200d0321e0SJeremy L Thompson // Restrict 4212b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); 4220d0321e0SJeremy L Thompson // Active 4232b730f8bSJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 4240d0321e0SJeremy L Thompson 4252b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, impl->evecs[i + impl->numein], vec, request)); 4260d0321e0SJeremy L Thompson } 4270d0321e0SJeremy L Thompson 4280d0321e0SJeremy L Thompson // Restore input arrays 4292b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, opinputfields, false, edata, impl)); 4300d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 4310d0321e0SJeremy L Thompson } 4320d0321e0SJeremy L Thompson 4330d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 4340d0321e0SJeremy L Thompson // Core code for assembling linear QFunction 4350d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 4362b730f8bSJeremy L Thompson static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, 4370d0321e0SJeremy L Thompson CeedRequest *request) { 4380d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 4392b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 4400d0321e0SJeremy L Thompson CeedQFunction qf; 4412b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 4420d0321e0SJeremy L Thompson CeedInt Q, numelements, numinputfields, numoutputfields, size; 443d2643443SJeremy L Thompson CeedSize q_size; 4442b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 4452b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); 4460d0321e0SJeremy L Thompson CeedOperatorField *opinputfields, *opoutputfields; 4472b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); 4480d0321e0SJeremy L Thompson CeedQFunctionField *qfinputfields, *qfoutputfields; 4492b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); 4500d0321e0SJeremy L Thompson CeedVector vec; 4510d0321e0SJeremy L Thompson CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; 4520d0321e0SJeremy L Thompson CeedVector *activein = impl->qfactivein; 4530d0321e0SJeremy L Thompson CeedScalar *a, *tmp; 4540d0321e0SJeremy L Thompson Ceed ceed, ceedparent; 4552b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 4562b730f8bSJeremy L Thompson CeedCallBackend(CeedGetOperatorFallbackParentCeed(ceed, &ceedparent)); 4570d0321e0SJeremy L Thompson ceedparent = ceedparent ? ceedparent : ceed; 4580d0321e0SJeremy L Thompson CeedScalar *edata[2 * CEED_FIELD_MAX]; 4590d0321e0SJeremy L Thompson 4600d0321e0SJeremy L Thompson // Setup 4612b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetup_Cuda(op)); 4620d0321e0SJeremy L Thompson 4630d0321e0SJeremy L Thompson // Check for identity 4640d0321e0SJeremy L Thompson bool identityqf; 4652b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionIsIdentity(qf, &identityqf)); 4666574a04fSJeremy L Thompson CeedCheck(!identityqf, ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); 4670d0321e0SJeremy L Thompson 4680d0321e0SJeremy L Thompson // Input Evecs and Restriction 4692b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, opinputfields, NULL, true, edata, impl, request)); 4700d0321e0SJeremy L Thompson 4710d0321e0SJeremy L Thompson // Count number of active input fields 4720d0321e0SJeremy L Thompson if (!numactivein) { 4730d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 4740d0321e0SJeremy L Thompson // Get input vector 4752b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 4760d0321e0SJeremy L Thompson // Check if active input 4770d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 4782b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); 4792b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorSetValue(impl->qvecsin[i], 0.0)); 4802b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp)); 4812b730f8bSJeremy L Thompson CeedCallBackend(CeedRealloc(numactivein + size, &activein)); 4820d0321e0SJeremy L Thompson for (CeedInt field = 0; field < size; field++) { 483d2643443SJeremy L Thompson q_size = (CeedSize)Q * numelements; 4842b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceed, q_size, &activein[numactivein + field])); 4852b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorSetArray(activein[numactivein + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &tmp[field * Q * numelements])); 4860d0321e0SJeremy L Thompson } 4870d0321e0SJeremy L Thompson numactivein += size; 4882b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(impl->qvecsin[i], &tmp)); 4890d0321e0SJeremy L Thompson } 4900d0321e0SJeremy L Thompson } 4910d0321e0SJeremy L Thompson impl->qfnumactivein = numactivein; 4920d0321e0SJeremy L Thompson impl->qfactivein = activein; 4930d0321e0SJeremy L Thompson } 4940d0321e0SJeremy L Thompson 4950d0321e0SJeremy L Thompson // Count number of active output fields 4960d0321e0SJeremy L Thompson if (!numactiveout) { 4970d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 4980d0321e0SJeremy L Thompson // Get output vector 4992b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); 5000d0321e0SJeremy L Thompson // Check if active output 5010d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 5022b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); 5030d0321e0SJeremy L Thompson numactiveout += size; 5040d0321e0SJeremy L Thompson } 5050d0321e0SJeremy L Thompson } 5060d0321e0SJeremy L Thompson impl->qfnumactiveout = numactiveout; 5070d0321e0SJeremy L Thompson } 5080d0321e0SJeremy L Thompson 5090d0321e0SJeremy L Thompson // Check sizes 5106574a04fSJeremy L Thompson CeedCheck(numactivein > 0 && numactiveout > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); 5110d0321e0SJeremy L Thompson 5120d0321e0SJeremy L Thompson // Build objects if needed 5130d0321e0SJeremy L Thompson if (build_objects) { 5140d0321e0SJeremy L Thompson // Create output restriction 5150d0321e0SJeremy L Thompson CeedInt strides[3] = {1, numelements * Q, Q}; /* *NOPAD* */ 5162b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionCreateStrided(ceedparent, numelements, Q, numactivein * numactiveout, 5172b730f8bSJeremy L Thompson numactivein * numactiveout * numelements * Q, strides, rstr)); 5180d0321e0SJeremy L Thompson // Create assembled vector 519d2643443SJeremy L Thompson CeedSize l_size = (CeedSize)numelements * Q * numactivein * numactiveout; 5202b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorCreate(ceedparent, l_size, assembled)); 5210d0321e0SJeremy L Thompson } 5222b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); 5232b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a)); 5240d0321e0SJeremy L Thompson 5250d0321e0SJeremy L Thompson // Input basis apply 5262b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, numinputfields, true, edata, impl)); 5270d0321e0SJeremy L Thompson 5280d0321e0SJeremy L Thompson // Assemble QFunction 5290d0321e0SJeremy L Thompson for (CeedInt in = 0; in < numactivein; in++) { 5300d0321e0SJeremy L Thompson // Set Inputs 5312b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorSetValue(activein[in], 1.0)); 5320d0321e0SJeremy L Thompson if (numactivein > 1) { 5332b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorSetValue(activein[(in + numactivein - 1) % numactivein], 0.0)); 5340d0321e0SJeremy L Thompson } 5350d0321e0SJeremy L Thompson // Set Outputs 5360d0321e0SJeremy L Thompson for (CeedInt out = 0; out < numoutputfields; out++) { 5370d0321e0SJeremy L Thompson // Get output vector 5382b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec)); 5390d0321e0SJeremy L Thompson // Check if active output 5400d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 5412b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE, CEED_USE_POINTER, a)); 5422b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[out], &size)); 5430d0321e0SJeremy L Thompson a += size * Q * numelements; // Advance the pointer by the size of the output 5440d0321e0SJeremy L Thompson } 5450d0321e0SJeremy L Thompson } 5460d0321e0SJeremy L Thompson // Apply QFunction 5472b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionApply(qf, Q * numelements, impl->qvecsin, impl->qvecsout)); 5480d0321e0SJeremy L Thompson } 5490d0321e0SJeremy L Thompson 5500d0321e0SJeremy L Thompson // Un-set output Qvecs to prevent accidental overwrite of Assembled 5510d0321e0SJeremy L Thompson for (CeedInt out = 0; out < numoutputfields; out++) { 5520d0321e0SJeremy L Thompson // Get output vector 5532b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec)); 5540d0321e0SJeremy L Thompson // Check if active output 5550d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 5562b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL)); 5570d0321e0SJeremy L Thompson } 5580d0321e0SJeremy L Thompson } 5590d0321e0SJeremy L Thompson 5600d0321e0SJeremy L Thompson // Restore input arrays 5612b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, opinputfields, true, edata, impl)); 5620d0321e0SJeremy L Thompson 5630d0321e0SJeremy L Thompson // Restore output 5642b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(*assembled, &a)); 5650d0321e0SJeremy L Thompson 5660d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 5670d0321e0SJeremy L Thompson } 5680d0321e0SJeremy L Thompson 5690d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 5700d0321e0SJeremy L Thompson // Assemble Linear QFunction 5710d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 5722b730f8bSJeremy L Thompson static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { 5732b730f8bSJeremy L Thompson return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr, request); 5740d0321e0SJeremy L Thompson } 5750d0321e0SJeremy L Thompson 5760d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 5770d0321e0SJeremy L Thompson // Update Assembled Linear QFunction 5780d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 5792b730f8bSJeremy L Thompson static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { 5802b730f8bSJeremy L Thompson return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled, &rstr, request); 5810d0321e0SJeremy L Thompson } 5820d0321e0SJeremy L Thompson 5830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 5840d0321e0SJeremy L Thompson // Create point block restriction 5850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 5862b730f8bSJeremy L Thompson static int CreatePBRestriction(CeedElemRestriction rstr, CeedElemRestriction *pbRstr) { 5870d0321e0SJeremy L Thompson Ceed ceed; 5882b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 5890d0321e0SJeremy L Thompson const CeedInt *offsets; 5902b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); 5910d0321e0SJeremy L Thompson 5920d0321e0SJeremy L Thompson // Expand offsets 5937b63f5c6SJed Brown CeedInt nelem, ncomp, elemsize, compstride, *pbOffsets; 5947b63f5c6SJed Brown CeedSize l_size; 5952b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &nelem)); 5962b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &ncomp)); 5972b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elemsize)); 5982b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &compstride)); 5992b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); 6000d0321e0SJeremy L Thompson CeedInt shift = ncomp; 6012b730f8bSJeremy L Thompson if (compstride != 1) shift *= ncomp; 6022b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(nelem * elemsize, &pbOffsets)); 6030d0321e0SJeremy L Thompson for (CeedInt i = 0; i < nelem * elemsize; i++) { 6040d0321e0SJeremy L Thompson pbOffsets[i] = offsets[i] * shift; 6050d0321e0SJeremy L Thompson } 6060d0321e0SJeremy L Thompson 6070d0321e0SJeremy L Thompson // Create new restriction 6082b730f8bSJeremy L Thompson CeedCallBackend( 6092b730f8bSJeremy L Thompson CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp * ncomp, 1, l_size * ncomp, CEED_MEM_HOST, CEED_OWN_POINTER, pbOffsets, pbRstr)); 6100d0321e0SJeremy L Thompson 6110d0321e0SJeremy L Thompson // Cleanup 6122b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); 6130d0321e0SJeremy L Thompson 6140d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 6150d0321e0SJeremy L Thompson } 6160d0321e0SJeremy L Thompson 6170d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 6180d0321e0SJeremy L Thompson // Assemble diagonal setup 6190d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 620*f7c1b517Snbeams static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, const bool pointBlock, CeedInt use_ceedsize_idx) { 6210d0321e0SJeremy L Thompson Ceed ceed; 6222b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 6230d0321e0SJeremy L Thompson CeedQFunction qf; 6242b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 6250d0321e0SJeremy L Thompson CeedInt numinputfields, numoutputfields; 6262b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields)); 6270d0321e0SJeremy L Thompson 6280d0321e0SJeremy L Thompson // Determine active input basis 6290d0321e0SJeremy L Thompson CeedOperatorField *opfields; 6300d0321e0SJeremy L Thompson CeedQFunctionField *qffields; 6312b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); 6322b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); 6330d0321e0SJeremy L Thompson CeedInt numemodein = 0, ncomp = 0, dim = 1; 6340d0321e0SJeremy L Thompson CeedEvalMode *emodein = NULL; 6350d0321e0SJeremy L Thompson CeedBasis basisin = NULL; 6360d0321e0SJeremy L Thompson CeedElemRestriction rstrin = NULL; 6370d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 6380d0321e0SJeremy L Thompson CeedVector vec; 6392b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); 6400d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 6410d0321e0SJeremy L Thompson CeedElemRestriction rstr; 6422b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisin)); 6432b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basisin, &ncomp)); 6442b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basisin, &dim)); 6452b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); 6466574a04fSJeremy L Thompson CeedCheck(!rstrin || rstrin == rstr, ceed, CEED_ERROR_BACKEND, 6476574a04fSJeremy L Thompson "Backend does not implement multi-field non-composite operator diagonal assembly"); 6480d0321e0SJeremy L Thompson rstrin = rstr; 6490d0321e0SJeremy L Thompson CeedEvalMode emode; 6502b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); 6510d0321e0SJeremy L Thompson switch (emode) { 6520d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 6530d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 6542b730f8bSJeremy L Thompson CeedCallBackend(CeedRealloc(numemodein + 1, &emodein)); 6550d0321e0SJeremy L Thompson emodein[numemodein] = emode; 6560d0321e0SJeremy L Thompson numemodein += 1; 6570d0321e0SJeremy L Thompson break; 6580d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 6592b730f8bSJeremy L Thompson CeedCallBackend(CeedRealloc(numemodein + dim, &emodein)); 6602b730f8bSJeremy L Thompson for (CeedInt d = 0; d < dim; d++) emodein[numemodein + d] = emode; 6610d0321e0SJeremy L Thompson numemodein += dim; 6620d0321e0SJeremy L Thompson break; 6630d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: 6640d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 6650d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 6660d0321e0SJeremy L Thompson break; // Caught by QF Assembly 6670d0321e0SJeremy L Thompson } 6680d0321e0SJeremy L Thompson } 6690d0321e0SJeremy L Thompson } 6700d0321e0SJeremy L Thompson 6710d0321e0SJeremy L Thompson // Determine active output basis 6722b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); 6732b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); 6740d0321e0SJeremy L Thompson CeedInt numemodeout = 0; 6750d0321e0SJeremy L Thompson CeedEvalMode *emodeout = NULL; 6760d0321e0SJeremy L Thompson CeedBasis basisout = NULL; 6770d0321e0SJeremy L Thompson CeedElemRestriction rstrout = NULL; 6780d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 6790d0321e0SJeremy L Thompson CeedVector vec; 6802b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); 6810d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 6820d0321e0SJeremy L Thompson CeedElemRestriction rstr; 6832b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisout)); 6842b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); 6856574a04fSJeremy L Thompson CeedCheck(!rstrout || rstrout == rstr, ceed, CEED_ERROR_BACKEND, 6866574a04fSJeremy L Thompson "Backend does not implement multi-field non-composite operator diagonal assembly"); 6870d0321e0SJeremy L Thompson rstrout = rstr; 6880d0321e0SJeremy L Thompson CeedEvalMode emode; 6892b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); 6900d0321e0SJeremy L Thompson switch (emode) { 6910d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 6920d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 6932b730f8bSJeremy L Thompson CeedCallBackend(CeedRealloc(numemodeout + 1, &emodeout)); 6940d0321e0SJeremy L Thompson emodeout[numemodeout] = emode; 6950d0321e0SJeremy L Thompson numemodeout += 1; 6960d0321e0SJeremy L Thompson break; 6970d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 6982b730f8bSJeremy L Thompson CeedCallBackend(CeedRealloc(numemodeout + dim, &emodeout)); 6992b730f8bSJeremy L Thompson for (CeedInt d = 0; d < dim; d++) emodeout[numemodeout + d] = emode; 7000d0321e0SJeremy L Thompson numemodeout += dim; 7010d0321e0SJeremy L Thompson break; 7020d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: 7030d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 7040d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 7050d0321e0SJeremy L Thompson break; // Caught by QF Assembly 7060d0321e0SJeremy L Thompson } 7070d0321e0SJeremy L Thompson } 7080d0321e0SJeremy L Thompson } 7090d0321e0SJeremy L Thompson 7100d0321e0SJeremy L Thompson // Operator data struct 7110d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 7122b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 7132b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl->diag)); 7140d0321e0SJeremy L Thompson CeedOperatorDiag_Cuda *diag = impl->diag; 7150d0321e0SJeremy L Thompson diag->basisin = basisin; 7160d0321e0SJeremy L Thompson diag->basisout = basisout; 7170d0321e0SJeremy L Thompson diag->h_emodein = emodein; 7180d0321e0SJeremy L Thompson diag->h_emodeout = emodeout; 7190d0321e0SJeremy L Thompson diag->numemodein = numemodein; 7200d0321e0SJeremy L Thompson diag->numemodeout = numemodeout; 7210d0321e0SJeremy L Thompson 7220d0321e0SJeremy L Thompson // Assemble kernel 72307b31e0eSJeremy L Thompson char *diagonal_kernel_path, *diagonal_kernel_source; 7242b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); 72507b31e0eSJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Diagonal Assembly Kernel Source -----\n"); 7262b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); 7272b730f8bSJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Diagonal Assembly Source Complete! -----\n"); 7280d0321e0SJeremy L Thompson CeedInt nnodes, nqpts; 7292b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basisin, &nnodes)); 7302b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basisin, &nqpts)); 7310d0321e0SJeremy L Thompson diag->nnodes = nnodes; 732*f7c1b517Snbeams CeedCallCuda(ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUMEMODEIN", numemodein, "NUMEMODEOUT", numemodeout, "NNODES", 733*f7c1b517Snbeams nnodes, "NQPTS", nqpts, "NCOMP", ncomp, "CEEDSIZE", use_ceedsize_idx)); 734eb7e6cafSJeremy L Thompson CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal)); 735eb7e6cafSJeremy L Thompson CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock)); 7362b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&diagonal_kernel_path)); 7372b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&diagonal_kernel_source)); 7380d0321e0SJeremy L Thompson 7390d0321e0SJeremy L Thompson // Basis matrices 7400d0321e0SJeremy L Thompson const CeedInt qBytes = nqpts * sizeof(CeedScalar); 7410d0321e0SJeremy L Thompson const CeedInt iBytes = qBytes * nnodes; 7420d0321e0SJeremy L Thompson const CeedInt gBytes = qBytes * nnodes * dim; 7430d0321e0SJeremy L Thompson const CeedInt eBytes = sizeof(CeedEvalMode); 7440d0321e0SJeremy L Thompson const CeedScalar *interpin, *interpout, *gradin, *gradout; 7450d0321e0SJeremy L Thompson 7460d0321e0SJeremy L Thompson // CEED_EVAL_NONE 7470d0321e0SJeremy L Thompson CeedScalar *identity = NULL; 7480d0321e0SJeremy L Thompson bool evalNone = false; 7492b730f8bSJeremy L Thompson for (CeedInt i = 0; i < numemodein; i++) evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE); 7502b730f8bSJeremy L Thompson for (CeedInt i = 0; i < numemodeout; i++) evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE); 7510d0321e0SJeremy L Thompson if (evalNone) { 7522b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity)); 7532b730f8bSJeremy L Thompson for (CeedInt i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; 7542b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_identity, iBytes)); 7552b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_identity, identity, iBytes, cudaMemcpyHostToDevice)); 7560d0321e0SJeremy L Thompson } 7570d0321e0SJeremy L Thompson 7580d0321e0SJeremy L Thompson // CEED_EVAL_INTERP 7592b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetInterp(basisin, &interpin)); 7602b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interpin, iBytes)); 7612b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_interpin, interpin, iBytes, cudaMemcpyHostToDevice)); 7622b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetInterp(basisout, &interpout)); 7632b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interpout, iBytes)); 7642b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_interpout, interpout, iBytes, cudaMemcpyHostToDevice)); 7650d0321e0SJeremy L Thompson 7660d0321e0SJeremy L Thompson // CEED_EVAL_GRAD 7672b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetGrad(basisin, &gradin)); 7682b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_gradin, gBytes)); 7692b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_gradin, gradin, gBytes, cudaMemcpyHostToDevice)); 7702b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetGrad(basisout, &gradout)); 7712b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_gradout, gBytes)); 7722b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_gradout, gradout, gBytes, cudaMemcpyHostToDevice)); 7730d0321e0SJeremy L Thompson 7740d0321e0SJeremy L Thompson // Arrays of emodes 7752b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_emodein, numemodein * eBytes)); 7762b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_emodein, emodein, numemodein * eBytes, cudaMemcpyHostToDevice)); 7772b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_emodeout, numemodeout * eBytes)); 7782b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes, cudaMemcpyHostToDevice)); 7790d0321e0SJeremy L Thompson 7800d0321e0SJeremy L Thompson // Restriction 7810d0321e0SJeremy L Thompson diag->diagrstr = rstrout; 7820d0321e0SJeremy L Thompson 7830d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 7840d0321e0SJeremy L Thompson } 7850d0321e0SJeremy L Thompson 7860d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 7870d0321e0SJeremy L Thompson // Assemble diagonal common code 7880d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 7892b730f8bSJeremy L Thompson static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool pointBlock) { 7900d0321e0SJeremy L Thompson Ceed ceed; 7912b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 7920d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 7932b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 7940d0321e0SJeremy L Thompson 7950d0321e0SJeremy L Thompson // Assemble QFunction 796c5f45aeaSJeremy L Thompson CeedVector assembledqf = NULL; 797c5f45aeaSJeremy L Thompson CeedElemRestriction rstr = NULL; 7982b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, &rstr, request)); 7992b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); 8000d0321e0SJeremy L Thompson 801*f7c1b517Snbeams CeedSize assembled_length = 0, assembledqf_length = 0; 802*f7c1b517Snbeams CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); 803*f7c1b517Snbeams CeedCallBackend(CeedVectorGetLength(assembledqf, &assembledqf_length)); 804*f7c1b517Snbeams CeedInt use_ceedsize_idx = 0; 805*f7c1b517Snbeams if ((assembled_length > INT_MAX) || (assembledqf_length > INT_MAX)) use_ceedsize_idx = 1; 806*f7c1b517Snbeams 8070d0321e0SJeremy L Thompson // Setup 8080d0321e0SJeremy L Thompson if (!impl->diag) { 809*f7c1b517Snbeams CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock, use_ceedsize_idx)); 8100d0321e0SJeremy L Thompson } 8110d0321e0SJeremy L Thompson CeedOperatorDiag_Cuda *diag = impl->diag; 8120d0321e0SJeremy L Thompson assert(diag != NULL); 8130d0321e0SJeremy L Thompson 8140d0321e0SJeremy L Thompson // Restriction 8150d0321e0SJeremy L Thompson if (pointBlock && !diag->pbdiagrstr) { 8160d0321e0SJeremy L Thompson CeedElemRestriction pbdiagrstr; 8172b730f8bSJeremy L Thompson CeedCallBackend(CreatePBRestriction(diag->diagrstr, &pbdiagrstr)); 8180d0321e0SJeremy L Thompson diag->pbdiagrstr = pbdiagrstr; 8190d0321e0SJeremy L Thompson } 8200d0321e0SJeremy L Thompson CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr; 8210d0321e0SJeremy L Thompson 8220d0321e0SJeremy L Thompson // Create diagonal vector 8230d0321e0SJeremy L Thompson CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag; 8240d0321e0SJeremy L Thompson if (!elemdiag) { 8252b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag)); 8262b730f8bSJeremy L Thompson if (pointBlock) diag->pbelemdiag = elemdiag; 8272b730f8bSJeremy L Thompson else diag->elemdiag = elemdiag; 8280d0321e0SJeremy L Thompson } 8292b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorSetValue(elemdiag, 0.0)); 8300d0321e0SJeremy L Thompson 8310d0321e0SJeremy L Thompson // Assemble element operator diagonals 8320d0321e0SJeremy L Thompson CeedScalar *elemdiagarray; 8330d0321e0SJeremy L Thompson const CeedScalar *assembledqfarray; 8342b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray)); 8352b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray)); 8360d0321e0SJeremy L Thompson CeedInt nelem; 8372b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(diagrstr, &nelem)); 8380d0321e0SJeremy L Thompson 8390d0321e0SJeremy L Thompson // Compute the diagonal of B^T D B 8400d0321e0SJeremy L Thompson int elemsPerBlock = 1; 8410d0321e0SJeremy L Thompson int grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); 8422b730f8bSJeremy L Thompson void *args[] = {(void *)&nelem, &diag->d_identity, &diag->d_interpin, &diag->d_gradin, &diag->d_interpout, 8432b730f8bSJeremy L Thompson &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout, &assembledqfarray, &elemdiagarray}; 8440d0321e0SJeremy L Thompson if (pointBlock) { 845eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearPointBlock, grid, diag->nnodes, 1, elemsPerBlock, args)); 8460d0321e0SJeremy L Thompson } else { 847eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearDiagonal, grid, diag->nnodes, 1, elemsPerBlock, args)); 8480d0321e0SJeremy L Thompson } 8490d0321e0SJeremy L Thompson 8500d0321e0SJeremy L Thompson // Restore arrays 8512b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(elemdiag, &elemdiagarray)); 8522b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray)); 8530d0321e0SJeremy L Thompson 8540d0321e0SJeremy L Thompson // Assemble local operator diagonal 8552b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, assembled, request)); 8560d0321e0SJeremy L Thompson 8570d0321e0SJeremy L Thompson // Cleanup 8582b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&assembledqf)); 8590d0321e0SJeremy L Thompson 8600d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 8610d0321e0SJeremy L Thompson } 8620d0321e0SJeremy L Thompson 8630d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 8640d0321e0SJeremy L Thompson // Assemble Linear Diagonal 8650d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 8662b730f8bSJeremy L Thompson static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) { 8672b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false)); 8686aa95790SJeremy L Thompson return CEED_ERROR_SUCCESS; 8690d0321e0SJeremy L Thompson } 8700d0321e0SJeremy L Thompson 8710d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 8720d0321e0SJeremy L Thompson // Assemble Linear Point Block Diagonal 8730d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 8742b730f8bSJeremy L Thompson static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) { 8752b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true)); 8766aa95790SJeremy L Thompson return CEED_ERROR_SUCCESS; 8770d0321e0SJeremy L Thompson } 8780d0321e0SJeremy L Thompson 8790d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 880cc132f9aSnbeams // Single operator assembly setup 881cc132f9aSnbeams //------------------------------------------------------------------------------ 882*f7c1b517Snbeams static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { 883cc132f9aSnbeams Ceed ceed; 8842b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 885cc132f9aSnbeams CeedOperator_Cuda *impl; 8862b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 887cc132f9aSnbeams 888cc132f9aSnbeams // Get intput and output fields 889cc132f9aSnbeams CeedInt num_input_fields, num_output_fields; 890cc132f9aSnbeams CeedOperatorField *input_fields; 891cc132f9aSnbeams CeedOperatorField *output_fields; 8922b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); 893cc132f9aSnbeams 894cc132f9aSnbeams // Determine active input basis eval mode 895cc132f9aSnbeams CeedQFunction qf; 8962b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 897cc132f9aSnbeams CeedQFunctionField *qf_fields; 8982b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 899cc132f9aSnbeams // Note that the kernel will treat each dimension of a gradient action separately; 900ea61e9acSJeremy L Thompson // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment by dim. 901ea61e9acSJeremy L Thompson // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so 902cc132f9aSnbeams // num_B_in_mats_to_load will be incremented by 1. 903cc132f9aSnbeams CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0; 904cc132f9aSnbeams CeedEvalMode *eval_mode_in = NULL; // will be of size num_B_in_mats_load 905cc132f9aSnbeams CeedBasis basis_in = NULL; 906b216396cSJeremy L Thompson CeedInt nqpts = 0, esize = 0; 907cc132f9aSnbeams CeedElemRestriction rstr_in = NULL; 908cc132f9aSnbeams for (CeedInt i = 0; i < num_input_fields; i++) { 909cc132f9aSnbeams CeedVector vec; 9102b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); 911cc132f9aSnbeams if (vec == CEED_VECTOR_ACTIVE) { 9122b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in)); 9132b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); 9142b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &nqpts)); 9152b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); 9162b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &esize)); 917cc132f9aSnbeams CeedEvalMode eval_mode; 9182b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 919cc132f9aSnbeams if (eval_mode != CEED_EVAL_NONE) { 9202b730f8bSJeremy L Thompson CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); 921cc132f9aSnbeams eval_mode_in[num_B_in_mats_to_load] = eval_mode; 922cc132f9aSnbeams num_B_in_mats_to_load += 1; 923cc132f9aSnbeams if (eval_mode == CEED_EVAL_GRAD) { 924cc132f9aSnbeams num_emode_in += dim; 925cc132f9aSnbeams size_B_in += dim * esize * nqpts; 926cc132f9aSnbeams } else { 927cc132f9aSnbeams num_emode_in += 1; 928cc132f9aSnbeams size_B_in += esize * nqpts; 929cc132f9aSnbeams } 930cc132f9aSnbeams } 931cc132f9aSnbeams } 932cc132f9aSnbeams } 933cc132f9aSnbeams 934cc132f9aSnbeams // Determine active output basis; basis_out and rstr_out only used if same as input, TODO 9352b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 936cc132f9aSnbeams CeedInt num_emode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0; 937cc132f9aSnbeams CeedEvalMode *eval_mode_out = NULL; 938cc132f9aSnbeams CeedBasis basis_out = NULL; 939cc132f9aSnbeams CeedElemRestriction rstr_out = NULL; 940cc132f9aSnbeams for (CeedInt i = 0; i < num_output_fields; i++) { 941cc132f9aSnbeams CeedVector vec; 9422b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); 943cc132f9aSnbeams if (vec == CEED_VECTOR_ACTIVE) { 9442b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out)); 9452b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); 9466574a04fSJeremy L Thompson CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); 947cc132f9aSnbeams CeedEvalMode eval_mode; 9482b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 949cc132f9aSnbeams if (eval_mode != CEED_EVAL_NONE) { 9502b730f8bSJeremy L Thompson CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); 951cc132f9aSnbeams eval_mode_out[num_B_out_mats_to_load] = eval_mode; 952cc132f9aSnbeams num_B_out_mats_to_load += 1; 953cc132f9aSnbeams if (eval_mode == CEED_EVAL_GRAD) { 954cc132f9aSnbeams num_emode_out += dim; 955cc132f9aSnbeams size_B_out += dim * esize * nqpts; 956cc132f9aSnbeams } else { 957cc132f9aSnbeams num_emode_out += 1; 958cc132f9aSnbeams size_B_out += esize * nqpts; 959cc132f9aSnbeams } 960cc132f9aSnbeams } 961cc132f9aSnbeams } 962cc132f9aSnbeams } 9636574a04fSJeremy L Thompson CeedCheck(num_emode_in > 0 && num_emode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); 964cc132f9aSnbeams 965cc132f9aSnbeams CeedInt nelem, ncomp; 9662b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &nelem)); 9672b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &ncomp)); 968cc132f9aSnbeams 9692b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl->asmb)); 970cc132f9aSnbeams CeedOperatorAssemble_Cuda *asmb = impl->asmb; 971cc132f9aSnbeams asmb->nelem = nelem; 972cc132f9aSnbeams 973cc132f9aSnbeams // Compile kernels 974cc132f9aSnbeams int elemsPerBlock = 1; 975cc132f9aSnbeams asmb->elemsPerBlock = elemsPerBlock; 97659ad764aSnbeams CeedInt block_size = esize * esize * elemsPerBlock; 977cc132f9aSnbeams Ceed_Cuda *cuda_data; 9782b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &cuda_data)); 97907b31e0eSJeremy L Thompson char *assembly_kernel_path, *assembly_kernel_source; 9802b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path)); 98107b31e0eSJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Assembly Kernel Source -----\n"); 9822b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); 98307b31e0eSJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Assembly Source Complete! -----\n"); 98407b31e0eSJeremy L Thompson bool fallback = block_size > cuda_data->device_prop.maxThreadsPerBlock; 98507b31e0eSJeremy L Thompson if (fallback) { 98659ad764aSnbeams // Use fallback kernel with 1D threadblock 98759ad764aSnbeams block_size = esize * elemsPerBlock; 98859ad764aSnbeams asmb->block_size_x = esize; 98959ad764aSnbeams asmb->block_size_y = 1; 99059ad764aSnbeams } else { // Use kernel with 2D threadblock 99159ad764aSnbeams asmb->block_size_x = esize; 99259ad764aSnbeams asmb->block_size_y = esize; 99307b31e0eSJeremy L Thompson } 994*f7c1b517Snbeams CeedCallCuda( 995*f7c1b517Snbeams ceed, CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 8, "NELEM", nelem, "NUMEMODEIN", num_emode_in, "NUMEMODEOUT", num_emode_out, 996*f7c1b517Snbeams "NQPTS", nqpts, "NNODES", esize, "BLOCK_SIZE", block_size, "NCOMP", ncomp, "CEEDSIZE", use_ceedsize_idx)); 997eb7e6cafSJeremy L Thompson CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, asmb->module, fallback ? "linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble)); 9982b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&assembly_kernel_path)); 9992b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&assembly_kernel_source)); 1000cc132f9aSnbeams 1001cc132f9aSnbeams // Build 'full' B matrices (not 1D arrays used for tensor-product matrices) 1002cc132f9aSnbeams const CeedScalar *interp_in, *grad_in; 10032b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); 10042b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); 1005cc132f9aSnbeams 1006cc132f9aSnbeams // Load into B_in, in order that they will be used in eval_mode 1007cc132f9aSnbeams const CeedInt inBytes = size_B_in * sizeof(CeedScalar); 1008cc132f9aSnbeams CeedInt mat_start = 0; 10092b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_in, inBytes)); 1010cc132f9aSnbeams for (int i = 0; i < num_B_in_mats_to_load; i++) { 1011cc132f9aSnbeams CeedEvalMode eval_mode = eval_mode_in[i]; 1012cc132f9aSnbeams if (eval_mode == CEED_EVAL_INTERP) { 10132b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], interp_in, esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); 1014cc132f9aSnbeams mat_start += esize * nqpts; 1015cc132f9aSnbeams } else if (eval_mode == CEED_EVAL_GRAD) { 10162b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); 1017cc132f9aSnbeams mat_start += dim * esize * nqpts; 1018cc132f9aSnbeams } 1019cc132f9aSnbeams } 1020cc132f9aSnbeams 1021cc132f9aSnbeams const CeedScalar *interp_out, *grad_out; 1022cc132f9aSnbeams // Note that this function currently assumes 1 basis, so this should always be true 1023cc132f9aSnbeams // for now 1024cc132f9aSnbeams if (basis_out == basis_in) { 1025cc132f9aSnbeams interp_out = interp_in; 1026cc132f9aSnbeams grad_out = grad_in; 1027cc132f9aSnbeams } else { 10282b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); 10292b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); 1030cc132f9aSnbeams } 1031cc132f9aSnbeams 1032cc132f9aSnbeams // Load into B_out, in order that they will be used in eval_mode 1033cc132f9aSnbeams const CeedInt outBytes = size_B_out * sizeof(CeedScalar); 1034cc132f9aSnbeams mat_start = 0; 10352b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, outBytes)); 1036cc132f9aSnbeams for (int i = 0; i < num_B_out_mats_to_load; i++) { 1037cc132f9aSnbeams CeedEvalMode eval_mode = eval_mode_out[i]; 1038cc132f9aSnbeams if (eval_mode == CEED_EVAL_INTERP) { 10392b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], interp_out, esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); 1040cc132f9aSnbeams mat_start += esize * nqpts; 1041cc132f9aSnbeams } else if (eval_mode == CEED_EVAL_GRAD) { 10422b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); 1043cc132f9aSnbeams mat_start += dim * esize * nqpts; 1044cc132f9aSnbeams } 1045cc132f9aSnbeams } 1046cc132f9aSnbeams return CEED_ERROR_SUCCESS; 1047cc132f9aSnbeams } 1048cc132f9aSnbeams 1049cc132f9aSnbeams //------------------------------------------------------------------------------ 1050cefa2673SJeremy L Thompson // Assemble matrix data for COO matrix of assembled operator. 1051cefa2673SJeremy L Thompson // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic. 1052cefa2673SJeremy L Thompson // 1053ea61e9acSJeremy L Thompson // Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval 1054cefa2673SJeremy L Thompson // modes). 1055cefa2673SJeremy L Thompson // TODO: allow multiple active input restrictions/basis objects 1056cc132f9aSnbeams //------------------------------------------------------------------------------ 10572b730f8bSJeremy L Thompson static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) { 1058cc132f9aSnbeams Ceed ceed; 10592b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1060cc132f9aSnbeams CeedOperator_Cuda *impl; 10612b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &impl)); 1062cc132f9aSnbeams 1063cc132f9aSnbeams // Assemble QFunction 1064c5f45aeaSJeremy L Thompson CeedVector assembled_qf = NULL; 1065c5f45aeaSJeremy L Thompson CeedElemRestriction rstr_q = NULL; 10662b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); 10672b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); 1068cc132f9aSnbeams CeedScalar *values_array; 106928ec399dSJeremy L Thompson CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); 1070cc132f9aSnbeams values_array += offset; 1071cc132f9aSnbeams const CeedScalar *qf_array; 10722b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); 1073cc132f9aSnbeams 1074*f7c1b517Snbeams CeedSize values_length = 0, assembled_qf_length = 0; 1075*f7c1b517Snbeams CeedCallBackend(CeedVectorGetLength(values, &values_length)); 1076*f7c1b517Snbeams CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); 1077*f7c1b517Snbeams CeedInt use_ceedsize_idx = 0; 1078*f7c1b517Snbeams if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; 1079*f7c1b517Snbeams // Setup 1080*f7c1b517Snbeams if (!impl->asmb) { 1081*f7c1b517Snbeams CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx)); 1082*f7c1b517Snbeams assert(impl->asmb != NULL); 1083*f7c1b517Snbeams } 1084*f7c1b517Snbeams 1085cc132f9aSnbeams // Compute B^T D B 1086cc132f9aSnbeams const CeedInt nelem = impl->asmb->nelem; 1087cc132f9aSnbeams const CeedInt elemsPerBlock = impl->asmb->elemsPerBlock; 10882b730f8bSJeremy L Thompson const CeedInt grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); 10892b730f8bSJeremy L Thompson void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; 10902b730f8bSJeremy L Thompson CeedCallBackend( 1091eb7e6cafSJeremy L Thompson CeedRunKernelDim_Cuda(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elemsPerBlock, args)); 1092cc132f9aSnbeams 1093cc132f9aSnbeams // Restore arrays 10942b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); 10952b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array)); 1096cc132f9aSnbeams 1097cc132f9aSnbeams // Cleanup 10982b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&assembled_qf)); 1099cc132f9aSnbeams 1100cc132f9aSnbeams return CEED_ERROR_SUCCESS; 1101cc132f9aSnbeams } 1102cc132f9aSnbeams 1103cc132f9aSnbeams //------------------------------------------------------------------------------ 11040d0321e0SJeremy L Thompson // Create operator 11050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 11060d0321e0SJeremy L Thompson int CeedOperatorCreate_Cuda(CeedOperator op) { 11070d0321e0SJeremy L Thompson Ceed ceed; 11082b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 11090d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 11100d0321e0SJeremy L Thompson 11112b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 11122b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetData(op, impl)); 11130d0321e0SJeremy L Thompson 11142b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda)); 11152b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda)); 11162b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Cuda)); 11172b730f8bSJeremy L Thompson CeedCallBackend( 11182b730f8bSJeremy L Thompson CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda)); 11192b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda)); 11202b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda)); 11212b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda)); 11220d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 11230d0321e0SJeremy L Thompson } 11240d0321e0SJeremy L Thompson 11250d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1126