1*bd882c8aSJames Wright // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other 2*bd882c8aSJames Wright // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE 3*bd882c8aSJames Wright // files for details. 4*bd882c8aSJames Wright // 5*bd882c8aSJames Wright // SPDX-License-Identifier: BSD-2-Clause 6*bd882c8aSJames Wright // 7*bd882c8aSJames Wright // This file is part of CEED: http://github.com/ceed 8*bd882c8aSJames Wright 9*bd882c8aSJames Wright #include <ceed/backend.h> 10*bd882c8aSJames Wright #include <ceed/ceed.h> 11*bd882c8aSJames Wright 12*bd882c8aSJames Wright #include <cassert> 13*bd882c8aSJames Wright #include <string> 14*bd882c8aSJames Wright #include <sycl/sycl.hpp> 15*bd882c8aSJames Wright 16*bd882c8aSJames Wright #include "../sycl/ceed-sycl-compile.hpp" 17*bd882c8aSJames Wright #include "ceed-sycl-ref.hpp" 18*bd882c8aSJames Wright 19*bd882c8aSJames Wright class CeedOperatorSyclLinearDiagonal; 20*bd882c8aSJames Wright class CeedOperatorSyclLinearAssemble; 21*bd882c8aSJames Wright class CeedOperatorSyclLinearAssembleFallback; 22*bd882c8aSJames Wright 23*bd882c8aSJames Wright //------------------------------------------------------------------------------ 24*bd882c8aSJames Wright // Get Basis Emode Pointer 25*bd882c8aSJames Wright //------------------------------------------------------------------------------ 26*bd882c8aSJames Wright void CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basisptr, CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp, 27*bd882c8aSJames Wright const CeedScalar *grad) { 28*bd882c8aSJames Wright switch (emode) { 29*bd882c8aSJames Wright case CEED_EVAL_NONE: 30*bd882c8aSJames Wright *basisptr = identity; 31*bd882c8aSJames Wright break; 32*bd882c8aSJames Wright case CEED_EVAL_INTERP: 33*bd882c8aSJames Wright *basisptr = interp; 34*bd882c8aSJames Wright break; 35*bd882c8aSJames Wright case CEED_EVAL_GRAD: 36*bd882c8aSJames Wright *basisptr = grad; 37*bd882c8aSJames Wright break; 38*bd882c8aSJames Wright case CEED_EVAL_WEIGHT: 39*bd882c8aSJames Wright case CEED_EVAL_DIV: 40*bd882c8aSJames Wright case CEED_EVAL_CURL: 41*bd882c8aSJames Wright break; // Caught by QF Assembly 42*bd882c8aSJames Wright } 43*bd882c8aSJames Wright } 44*bd882c8aSJames Wright 45*bd882c8aSJames Wright //------------------------------------------------------------------------------ 46*bd882c8aSJames Wright // Destroy operator 47*bd882c8aSJames Wright //------------------------------------------------------------------------------ 48*bd882c8aSJames Wright static int CeedOperatorDestroy_Sycl(CeedOperator op) { 49*bd882c8aSJames Wright CeedOperator_Sycl *impl; 50*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetData(op, &impl)); 51*bd882c8aSJames Wright Ceed ceed; 52*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 53*bd882c8aSJames Wright Ceed_Sycl *sycl_data; 54*bd882c8aSJames Wright CeedCallBackend(CeedGetData(ceed, &sycl_data)); 55*bd882c8aSJames Wright 56*bd882c8aSJames Wright // Apply data 57*bd882c8aSJames Wright for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) { 58*bd882c8aSJames Wright CeedCallBackend(CeedVectorDestroy(&impl->evecs[i])); 59*bd882c8aSJames Wright } 60*bd882c8aSJames Wright CeedCallBackend(CeedFree(&impl->evecs)); 61*bd882c8aSJames Wright 62*bd882c8aSJames Wright for (CeedInt i = 0; i < impl->numein; i++) { 63*bd882c8aSJames Wright CeedCallBackend(CeedVectorDestroy(&impl->qvecsin[i])); 64*bd882c8aSJames Wright } 65*bd882c8aSJames Wright CeedCallBackend(CeedFree(&impl->qvecsin)); 66*bd882c8aSJames Wright 67*bd882c8aSJames Wright for (CeedInt i = 0; i < impl->numeout; i++) { 68*bd882c8aSJames Wright CeedCallBackend(CeedVectorDestroy(&impl->qvecsout[i])); 69*bd882c8aSJames Wright } 70*bd882c8aSJames Wright CeedCallBackend(CeedFree(&impl->qvecsout)); 71*bd882c8aSJames Wright 72*bd882c8aSJames Wright // QFunction assembly data 73*bd882c8aSJames Wright for (CeedInt i = 0; i < impl->qfnumactivein; i++) { 74*bd882c8aSJames Wright CeedCallBackend(CeedVectorDestroy(&impl->qfactivein[i])); 75*bd882c8aSJames Wright } 76*bd882c8aSJames Wright CeedCallBackend(CeedFree(&impl->qfactivein)); 77*bd882c8aSJames Wright 78*bd882c8aSJames Wright // Diag data 79*bd882c8aSJames Wright if (impl->diag) { 80*bd882c8aSJames Wright CeedCallBackend(CeedFree(&impl->diag->h_emodein)); 81*bd882c8aSJames Wright CeedCallBackend(CeedFree(&impl->diag->h_emodeout)); 82*bd882c8aSJames Wright 83*bd882c8aSJames Wright CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw()); 84*bd882c8aSJames Wright CeedCallSycl(ceed, sycl::free(impl->diag->d_emodein, sycl_data->sycl_context)); 85*bd882c8aSJames Wright CeedCallSycl(ceed, sycl::free(impl->diag->d_emodeout, sycl_data->sycl_context)); 86*bd882c8aSJames Wright CeedCallSycl(ceed, sycl::free(impl->diag->d_identity, sycl_data->sycl_context)); 87*bd882c8aSJames Wright CeedCallSycl(ceed, sycl::free(impl->diag->d_interpin, sycl_data->sycl_context)); 88*bd882c8aSJames Wright CeedCallSycl(ceed, sycl::free(impl->diag->d_interpout, sycl_data->sycl_context)); 89*bd882c8aSJames Wright CeedCallSycl(ceed, sycl::free(impl->diag->d_gradin, sycl_data->sycl_context)); 90*bd882c8aSJames Wright CeedCallSycl(ceed, sycl::free(impl->diag->d_gradout, sycl_data->sycl_context)); 91*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr)); 92*bd882c8aSJames Wright 93*bd882c8aSJames Wright CeedCallBackend(CeedVectorDestroy(&impl->diag->elemdiag)); 94*bd882c8aSJames Wright CeedCallBackend(CeedVectorDestroy(&impl->diag->pbelemdiag)); 95*bd882c8aSJames Wright } 96*bd882c8aSJames Wright CeedCallBackend(CeedFree(&impl->diag)); 97*bd882c8aSJames Wright 98*bd882c8aSJames Wright if (impl->asmb) { 99*bd882c8aSJames Wright CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw()); 100*bd882c8aSJames Wright CeedCallSycl(ceed, sycl::free(impl->asmb->d_B_in, sycl_data->sycl_context)); 101*bd882c8aSJames Wright CeedCallSycl(ceed, sycl::free(impl->asmb->d_B_out, sycl_data->sycl_context)); 102*bd882c8aSJames Wright } 103*bd882c8aSJames Wright CeedCallBackend(CeedFree(&impl->asmb)); 104*bd882c8aSJames Wright 105*bd882c8aSJames Wright CeedCallBackend(CeedFree(&impl)); 106*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 107*bd882c8aSJames Wright } 108*bd882c8aSJames Wright 109*bd882c8aSJames Wright //------------------------------------------------------------------------------ 110*bd882c8aSJames Wright // Setup infields or outfields 111*bd882c8aSJames Wright //------------------------------------------------------------------------------ 112*bd882c8aSJames Wright static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool isinput, CeedVector *evecs, CeedVector *qvecs, CeedInt starte, 113*bd882c8aSJames Wright CeedInt numfields, CeedInt Q, CeedInt numelements) { 114*bd882c8aSJames Wright CeedInt dim, size; 115*bd882c8aSJames Wright CeedSize q_size; 116*bd882c8aSJames Wright Ceed ceed; 117*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 118*bd882c8aSJames Wright CeedBasis basis; 119*bd882c8aSJames Wright CeedElemRestriction Erestrict; 120*bd882c8aSJames Wright CeedOperatorField *opfields; 121*bd882c8aSJames Wright CeedQFunctionField *qffields; 122*bd882c8aSJames Wright CeedVector fieldvec; 123*bd882c8aSJames Wright bool strided; 124*bd882c8aSJames Wright bool skiprestrict; 125*bd882c8aSJames Wright 126*bd882c8aSJames Wright if (isinput) { 127*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); 128*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); 129*bd882c8aSJames Wright } else { 130*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); 131*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); 132*bd882c8aSJames Wright } 133*bd882c8aSJames Wright 134*bd882c8aSJames Wright // Loop over fields 135*bd882c8aSJames Wright for (CeedInt i = 0; i < numfields; i++) { 136*bd882c8aSJames Wright CeedEvalMode emode; 137*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); 138*bd882c8aSJames Wright 139*bd882c8aSJames Wright strided = false; 140*bd882c8aSJames Wright skiprestrict = false; 141*bd882c8aSJames Wright if (emode != CEED_EVAL_WEIGHT) { 142*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict)); 143*bd882c8aSJames Wright 144*bd882c8aSJames Wright // Check whether this field can skip the element restriction: 145*bd882c8aSJames Wright // must be passive input, with emode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. 146*bd882c8aSJames Wright 147*bd882c8aSJames Wright // First, check whether the field is input or output: 148*bd882c8aSJames Wright if (isinput) { 149*bd882c8aSJames Wright // Check for passive input: 150*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &fieldvec)); 151*bd882c8aSJames Wright if (fieldvec != CEED_VECTOR_ACTIVE) { 152*bd882c8aSJames Wright // Check emode 153*bd882c8aSJames Wright if (emode == CEED_EVAL_NONE) { 154*bd882c8aSJames Wright // Check for strided restriction 155*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &strided)); 156*bd882c8aSJames Wright if (strided) { 157*bd882c8aSJames Wright // Check if vector is already in preferred backend ordering 158*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &skiprestrict)); 159*bd882c8aSJames Wright } 160*bd882c8aSJames Wright } 161*bd882c8aSJames Wright } 162*bd882c8aSJames Wright } 163*bd882c8aSJames Wright if (skiprestrict) { 164*bd882c8aSJames Wright // We do not need an E-Vector, but will use the input field vector's data directly in the operator application 165*bd882c8aSJames Wright evecs[i + starte] = NULL; 166*bd882c8aSJames Wright } else { 167*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionCreateVector(Erestrict, NULL, &evecs[i + starte])); 168*bd882c8aSJames Wright } 169*bd882c8aSJames Wright } 170*bd882c8aSJames Wright 171*bd882c8aSJames Wright switch (emode) { 172*bd882c8aSJames Wright case CEED_EVAL_NONE: 173*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); 174*bd882c8aSJames Wright q_size = (CeedSize)numelements * Q * size; 175*bd882c8aSJames Wright CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); 176*bd882c8aSJames Wright break; 177*bd882c8aSJames Wright case CEED_EVAL_INTERP: 178*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); 179*bd882c8aSJames Wright q_size = (CeedSize)numelements * Q * size; 180*bd882c8aSJames Wright CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); 181*bd882c8aSJames Wright break; 182*bd882c8aSJames Wright case CEED_EVAL_GRAD: 183*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); 184*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); 185*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 186*bd882c8aSJames Wright q_size = (CeedSize)numelements * Q * size; 187*bd882c8aSJames Wright CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); 188*bd882c8aSJames Wright break; 189*bd882c8aSJames Wright case CEED_EVAL_WEIGHT: // Only on input fields 190*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); 191*bd882c8aSJames Wright q_size = (CeedSize)numelements * Q; 192*bd882c8aSJames Wright CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); 193*bd882c8aSJames Wright CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, NULL, qvecs[i])); 194*bd882c8aSJames Wright break; 195*bd882c8aSJames Wright case CEED_EVAL_DIV: 196*bd882c8aSJames Wright break; // TODO: Not implemented 197*bd882c8aSJames Wright case CEED_EVAL_CURL: 198*bd882c8aSJames Wright break; // TODO: Not implemented 199*bd882c8aSJames Wright } 200*bd882c8aSJames Wright } 201*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 202*bd882c8aSJames Wright } 203*bd882c8aSJames Wright 204*bd882c8aSJames Wright //------------------------------------------------------------------------------ 205*bd882c8aSJames Wright // CeedOperator needs to connect all the named fields (be they active or 206*bd882c8aSJames Wright // passive) to the named inputs and outputs of its CeedQFunction. 207*bd882c8aSJames Wright //------------------------------------------------------------------------------ 208*bd882c8aSJames Wright static int CeedOperatorSetup_Sycl(CeedOperator op) { 209*bd882c8aSJames Wright bool setupdone; 210*bd882c8aSJames Wright CeedCallBackend(CeedOperatorIsSetupDone(op, &setupdone)); 211*bd882c8aSJames Wright if (setupdone) return CEED_ERROR_SUCCESS; 212*bd882c8aSJames Wright 213*bd882c8aSJames Wright Ceed ceed; 214*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 215*bd882c8aSJames Wright CeedOperator_Sycl *impl; 216*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetData(op, &impl)); 217*bd882c8aSJames Wright CeedQFunction qf; 218*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 219*bd882c8aSJames Wright CeedInt Q, numelements, numinputfields, numoutputfields; 220*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 221*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); 222*bd882c8aSJames Wright CeedOperatorField *opinputfields, *opoutputfields; 223*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); 224*bd882c8aSJames Wright CeedQFunctionField *qfinputfields, *qfoutputfields; 225*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); 226*bd882c8aSJames Wright 227*bd882c8aSJames Wright // Allocate 228*bd882c8aSJames Wright CeedCallBackend(CeedCalloc(numinputfields + numoutputfields, &impl->evecs)); 229*bd882c8aSJames Wright 230*bd882c8aSJames Wright CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin)); 231*bd882c8aSJames Wright CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout)); 232*bd882c8aSJames Wright 233*bd882c8aSJames Wright impl->numein = numinputfields; 234*bd882c8aSJames Wright impl->numeout = numoutputfields; 235*bd882c8aSJames Wright 236*bd882c8aSJames Wright // Set up infield and outfield evecs and qvecs 237*bd882c8aSJames Wright // Infields 238*bd882c8aSJames Wright CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, true, impl->evecs, impl->qvecsin, 0, numinputfields, Q, numelements)); 239*bd882c8aSJames Wright 240*bd882c8aSJames Wright // Outfields 241*bd882c8aSJames Wright CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, false, impl->evecs, impl->qvecsout, numinputfields, numoutputfields, Q, numelements)); 242*bd882c8aSJames Wright 243*bd882c8aSJames Wright CeedCallBackend(CeedOperatorSetSetupDone(op)); 244*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 245*bd882c8aSJames Wright } 246*bd882c8aSJames Wright 247*bd882c8aSJames Wright //------------------------------------------------------------------------------ 248*bd882c8aSJames Wright // Setup Operator Inputs 249*bd882c8aSJames Wright //------------------------------------------------------------------------------ 250*bd882c8aSJames Wright static inline int CeedOperatorSetupInputs_Sycl(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 251*bd882c8aSJames Wright CeedVector invec, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], 252*bd882c8aSJames Wright CeedOperator_Sycl *impl, CeedRequest *request) { 253*bd882c8aSJames Wright CeedEvalMode emode; 254*bd882c8aSJames Wright CeedVector vec; 255*bd882c8aSJames Wright CeedElemRestriction Erestrict; 256*bd882c8aSJames Wright 257*bd882c8aSJames Wright for (CeedInt i = 0; i < numinputfields; i++) { 258*bd882c8aSJames Wright // Get input vector 259*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 260*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) { 261*bd882c8aSJames Wright if (skipactive) continue; 262*bd882c8aSJames Wright else vec = invec; 263*bd882c8aSJames Wright } 264*bd882c8aSJames Wright 265*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); 266*bd882c8aSJames Wright if (emode == CEED_EVAL_WEIGHT) { // Skip 267*bd882c8aSJames Wright } else { 268*bd882c8aSJames Wright // Get input vector 269*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 270*bd882c8aSJames Wright // Get input element restriction 271*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); 272*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) vec = invec; 273*bd882c8aSJames Wright // Restrict, if necessary 274*bd882c8aSJames Wright if (!impl->evecs[i]) { 275*bd882c8aSJames Wright // No restriction for this field; read data directly from vec. 276*bd882c8aSJames Wright CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); 277*bd882c8aSJames Wright } else { 278*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, impl->evecs[i], request)); 279*bd882c8aSJames Wright // Get evec 280*bd882c8aSJames Wright CeedCallBackend(CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); 281*bd882c8aSJames Wright } 282*bd882c8aSJames Wright } 283*bd882c8aSJames Wright } 284*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 285*bd882c8aSJames Wright } 286*bd882c8aSJames Wright 287*bd882c8aSJames Wright //------------------------------------------------------------------------------ 288*bd882c8aSJames Wright // Input Basis Action 289*bd882c8aSJames Wright //------------------------------------------------------------------------------ 290*bd882c8aSJames Wright static inline int CeedOperatorInputBasis_Sycl(CeedInt numelements, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 291*bd882c8aSJames Wright CeedInt numinputfields, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], 292*bd882c8aSJames Wright CeedOperator_Sycl *impl) { 293*bd882c8aSJames Wright CeedInt elemsize, size; 294*bd882c8aSJames Wright CeedElemRestriction Erestrict; 295*bd882c8aSJames Wright CeedEvalMode emode; 296*bd882c8aSJames Wright CeedBasis basis; 297*bd882c8aSJames Wright 298*bd882c8aSJames Wright for (CeedInt i = 0; i < numinputfields; i++) { 299*bd882c8aSJames Wright // Skip active input 300*bd882c8aSJames Wright if (skipactive) { 301*bd882c8aSJames Wright CeedVector vec; 302*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 303*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) continue; 304*bd882c8aSJames Wright } 305*bd882c8aSJames Wright // Get elemsize, emode, size 306*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); 307*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); 308*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); 309*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); 310*bd882c8aSJames Wright // Basis action 311*bd882c8aSJames Wright switch (emode) { 312*bd882c8aSJames Wright case CEED_EVAL_NONE: 313*bd882c8aSJames Wright CeedCallBackend(CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i])); 314*bd882c8aSJames Wright break; 315*bd882c8aSJames Wright case CEED_EVAL_INTERP: 316*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); 317*bd882c8aSJames Wright CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->evecs[i], impl->qvecsin[i])); 318*bd882c8aSJames Wright break; 319*bd882c8aSJames Wright case CEED_EVAL_GRAD: 320*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); 321*bd882c8aSJames Wright CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->evecs[i], impl->qvecsin[i])); 322*bd882c8aSJames Wright break; 323*bd882c8aSJames Wright case CEED_EVAL_WEIGHT: 324*bd882c8aSJames Wright break; // No action 325*bd882c8aSJames Wright case CEED_EVAL_DIV: 326*bd882c8aSJames Wright break; // TODO: Not implemented 327*bd882c8aSJames Wright case CEED_EVAL_CURL: 328*bd882c8aSJames Wright break; // TODO: Not implemented 329*bd882c8aSJames Wright } 330*bd882c8aSJames Wright } 331*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 332*bd882c8aSJames Wright } 333*bd882c8aSJames Wright 334*bd882c8aSJames Wright //------------------------------------------------------------------------------ 335*bd882c8aSJames Wright // Restore Input Vectors 336*bd882c8aSJames Wright //------------------------------------------------------------------------------ 337*bd882c8aSJames Wright static inline int CeedOperatorRestoreInputs_Sycl(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 338*bd882c8aSJames Wright const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl) { 339*bd882c8aSJames Wright CeedEvalMode emode; 340*bd882c8aSJames Wright CeedVector vec; 341*bd882c8aSJames Wright 342*bd882c8aSJames Wright for (CeedInt i = 0; i < numinputfields; i++) { 343*bd882c8aSJames Wright // Skip active input 344*bd882c8aSJames Wright if (skipactive) { 345*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 346*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) continue; 347*bd882c8aSJames Wright } 348*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); 349*bd882c8aSJames Wright if (emode == CEED_EVAL_WEIGHT) { // Skip 350*bd882c8aSJames Wright } else { 351*bd882c8aSJames Wright if (!impl->evecs[i]) { // This was a skiprestrict case 352*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 353*bd882c8aSJames Wright CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&edata[i])); 354*bd882c8aSJames Wright } else { 355*bd882c8aSJames Wright CeedCallBackend(CeedVectorRestoreArrayRead(impl->evecs[i], (const CeedScalar **)&edata[i])); 356*bd882c8aSJames Wright } 357*bd882c8aSJames Wright } 358*bd882c8aSJames Wright } 359*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 360*bd882c8aSJames Wright } 361*bd882c8aSJames Wright 362*bd882c8aSJames Wright //------------------------------------------------------------------------------ 363*bd882c8aSJames Wright // Apply and add to output 364*bd882c8aSJames Wright //------------------------------------------------------------------------------ 365*bd882c8aSJames Wright static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request) { 366*bd882c8aSJames Wright CeedOperator_Sycl *impl; 367*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetData(op, &impl)); 368*bd882c8aSJames Wright CeedQFunction qf; 369*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 370*bd882c8aSJames Wright CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size; 371*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 372*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); 373*bd882c8aSJames Wright CeedOperatorField *opinputfields, *opoutputfields; 374*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); 375*bd882c8aSJames Wright CeedQFunctionField *qfinputfields, *qfoutputfields; 376*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); 377*bd882c8aSJames Wright CeedEvalMode emode; 378*bd882c8aSJames Wright CeedVector vec; 379*bd882c8aSJames Wright CeedBasis basis; 380*bd882c8aSJames Wright CeedElemRestriction Erestrict; 381*bd882c8aSJames Wright CeedScalar *edata[2 * CEED_FIELD_MAX] = {0}; 382*bd882c8aSJames Wright 383*bd882c8aSJames Wright // Setup 384*bd882c8aSJames Wright CeedCallBackend(CeedOperatorSetup_Sycl(op)); 385*bd882c8aSJames Wright 386*bd882c8aSJames Wright // Input Evecs and Restriction 387*bd882c8aSJames Wright CeedCallBackend(CeedOperatorSetupInputs_Sycl(numinputfields, qfinputfields, opinputfields, invec, false, edata, impl, request)); 388*bd882c8aSJames Wright 389*bd882c8aSJames Wright // Input basis apply if needed 390*bd882c8aSJames Wright CeedCallBackend(CeedOperatorInputBasis_Sycl(numelements, qfinputfields, opinputfields, numinputfields, false, edata, impl)); 391*bd882c8aSJames Wright 392*bd882c8aSJames Wright // Output pointers, as necessary 393*bd882c8aSJames Wright for (CeedInt i = 0; i < numoutputfields; i++) { 394*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); 395*bd882c8aSJames Wright if (emode == CEED_EVAL_NONE) { 396*bd882c8aSJames Wright // Set the output Q-Vector to use the E-Vector data directly 397*bd882c8aSJames Wright CeedCallBackend(CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, &edata[i + numinputfields])); 398*bd882c8aSJames Wright CeedCallBackend(CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i + numinputfields])); 399*bd882c8aSJames Wright } 400*bd882c8aSJames Wright } 401*bd882c8aSJames Wright 402*bd882c8aSJames Wright // Q function 403*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout)); 404*bd882c8aSJames Wright 405*bd882c8aSJames Wright // Output basis apply if needed 406*bd882c8aSJames Wright for (CeedInt i = 0; i < numoutputfields; i++) { 407*bd882c8aSJames Wright // Get elemsize, emode, size 408*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); 409*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); 410*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); 411*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); 412*bd882c8aSJames Wright // Basis action 413*bd882c8aSJames Wright switch (emode) { 414*bd882c8aSJames Wright case CEED_EVAL_NONE: 415*bd882c8aSJames Wright break; 416*bd882c8aSJames Wright case CEED_EVAL_INTERP: 417*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); 418*bd882c8aSJames Wright CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->qvecsout[i], impl->evecs[i + impl->numein])); 419*bd882c8aSJames Wright break; 420*bd882c8aSJames Wright case CEED_EVAL_GRAD: 421*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); 422*bd882c8aSJames Wright CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->qvecsout[i], impl->evecs[i + impl->numein])); 423*bd882c8aSJames Wright break; 424*bd882c8aSJames Wright // LCOV_EXCL_START 425*bd882c8aSJames Wright case CEED_EVAL_WEIGHT: 426*bd882c8aSJames Wright Ceed ceed; 427*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 428*bd882c8aSJames Wright return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 429*bd882c8aSJames Wright break; // Should not occur 430*bd882c8aSJames Wright case CEED_EVAL_DIV: 431*bd882c8aSJames Wright break; // TODO: Not implemented 432*bd882c8aSJames Wright case CEED_EVAL_CURL: 433*bd882c8aSJames Wright break; // TODO: Not implemented 434*bd882c8aSJames Wright // LCOV_EXCL_STOP 435*bd882c8aSJames Wright } 436*bd882c8aSJames Wright } 437*bd882c8aSJames Wright 438*bd882c8aSJames Wright // Output restriction 439*bd882c8aSJames Wright for (CeedInt i = 0; i < numoutputfields; i++) { 440*bd882c8aSJames Wright // Restore evec 441*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); 442*bd882c8aSJames Wright if (emode == CEED_EVAL_NONE) { 443*bd882c8aSJames Wright CeedCallBackend(CeedVectorRestoreArray(impl->evecs[i + impl->numein], &edata[i + numinputfields])); 444*bd882c8aSJames Wright } 445*bd882c8aSJames Wright // Get output vector 446*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); 447*bd882c8aSJames Wright // Restrict 448*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); 449*bd882c8aSJames Wright // Active 450*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 451*bd882c8aSJames Wright 452*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, impl->evecs[i + impl->numein], vec, request)); 453*bd882c8aSJames Wright } 454*bd882c8aSJames Wright 455*bd882c8aSJames Wright // Restore input arrays 456*bd882c8aSJames Wright CeedCallBackend(CeedOperatorRestoreInputs_Sycl(numinputfields, qfinputfields, opinputfields, false, edata, impl)); 457*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 458*bd882c8aSJames Wright } 459*bd882c8aSJames Wright 460*bd882c8aSJames Wright //------------------------------------------------------------------------------ 461*bd882c8aSJames Wright // Core code for assembling linear QFunction 462*bd882c8aSJames Wright //------------------------------------------------------------------------------ 463*bd882c8aSJames Wright static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, 464*bd882c8aSJames Wright CeedRequest *request) { 465*bd882c8aSJames Wright CeedOperator_Sycl *impl; 466*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetData(op, &impl)); 467*bd882c8aSJames Wright CeedQFunction qf; 468*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 469*bd882c8aSJames Wright CeedInt Q, numelements, numinputfields, numoutputfields, size; 470*bd882c8aSJames Wright CeedSize q_size; 471*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 472*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); 473*bd882c8aSJames Wright CeedOperatorField *opinputfields, *opoutputfields; 474*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); 475*bd882c8aSJames Wright CeedQFunctionField *qfinputfields, *qfoutputfields; 476*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); 477*bd882c8aSJames Wright CeedVector vec; 478*bd882c8aSJames Wright CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; 479*bd882c8aSJames Wright CeedVector *activein = impl->qfactivein; 480*bd882c8aSJames Wright CeedScalar *a, *tmp; 481*bd882c8aSJames Wright Ceed ceed, ceedparent; 482*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 483*bd882c8aSJames Wright CeedCallBackend(CeedGetOperatorFallbackParentCeed(ceed, &ceedparent)); 484*bd882c8aSJames Wright ceedparent = ceedparent ? ceedparent : ceed; 485*bd882c8aSJames Wright CeedScalar *edata[2 * CEED_FIELD_MAX]; 486*bd882c8aSJames Wright 487*bd882c8aSJames Wright // Setup 488*bd882c8aSJames Wright CeedCallBackend(CeedOperatorSetup_Sycl(op)); 489*bd882c8aSJames Wright 490*bd882c8aSJames Wright // Check for identity 491*bd882c8aSJames Wright bool identityqf; 492*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionIsIdentity(qf, &identityqf)); 493*bd882c8aSJames Wright if (identityqf) { 494*bd882c8aSJames Wright // LCOV_EXCL_START 495*bd882c8aSJames Wright return CeedError(ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); 496*bd882c8aSJames Wright // LCOV_EXCL_STOP 497*bd882c8aSJames Wright } 498*bd882c8aSJames Wright 499*bd882c8aSJames Wright // Input Evecs and Restriction 500*bd882c8aSJames Wright CeedCallBackend(CeedOperatorSetupInputs_Sycl(numinputfields, qfinputfields, opinputfields, NULL, true, edata, impl, request)); 501*bd882c8aSJames Wright 502*bd882c8aSJames Wright // Count number of active input fields 503*bd882c8aSJames Wright if (!numactivein) { 504*bd882c8aSJames Wright for (CeedInt i = 0; i < numinputfields; i++) { 505*bd882c8aSJames Wright // Get input vector 506*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); 507*bd882c8aSJames Wright // Check if active input 508*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) { 509*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); 510*bd882c8aSJames Wright CeedCallBackend(CeedVectorSetValue(impl->qvecsin[i], 0.0)); 511*bd882c8aSJames Wright CeedCallBackend(CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp)); 512*bd882c8aSJames Wright CeedCallBackend(CeedRealloc(numactivein + size, &activein)); 513*bd882c8aSJames Wright for (CeedInt field = 0; field < size; field++) { 514*bd882c8aSJames Wright q_size = (CeedSize)Q * numelements; 515*bd882c8aSJames Wright CeedCallBackend(CeedVectorCreate(ceed, q_size, &activein[numactivein + field])); 516*bd882c8aSJames Wright CeedCallBackend(CeedVectorSetArray(activein[numactivein + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &tmp[field * Q * numelements])); 517*bd882c8aSJames Wright } 518*bd882c8aSJames Wright numactivein += size; 519*bd882c8aSJames Wright CeedCallBackend(CeedVectorRestoreArray(impl->qvecsin[i], &tmp)); 520*bd882c8aSJames Wright } 521*bd882c8aSJames Wright } 522*bd882c8aSJames Wright impl->qfnumactivein = numactivein; 523*bd882c8aSJames Wright impl->qfactivein = activein; 524*bd882c8aSJames Wright } 525*bd882c8aSJames Wright 526*bd882c8aSJames Wright // Count number of active output fields 527*bd882c8aSJames Wright if (!numactiveout) { 528*bd882c8aSJames Wright for (CeedInt i = 0; i < numoutputfields; i++) { 529*bd882c8aSJames Wright // Get output vector 530*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); 531*bd882c8aSJames Wright // Check if active output 532*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) { 533*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); 534*bd882c8aSJames Wright numactiveout += size; 535*bd882c8aSJames Wright } 536*bd882c8aSJames Wright } 537*bd882c8aSJames Wright impl->qfnumactiveout = numactiveout; 538*bd882c8aSJames Wright } 539*bd882c8aSJames Wright 540*bd882c8aSJames Wright // Check sizes 541*bd882c8aSJames Wright if (!numactivein || !numactiveout) { 542*bd882c8aSJames Wright // LCOV_EXCL_START 543*bd882c8aSJames Wright return CeedError(ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); 544*bd882c8aSJames Wright // LCOV_EXCL_STOP 545*bd882c8aSJames Wright } 546*bd882c8aSJames Wright 547*bd882c8aSJames Wright // Build objects if needed 548*bd882c8aSJames Wright if (build_objects) { 549*bd882c8aSJames Wright // Create output restriction 550*bd882c8aSJames Wright CeedInt strides[3] = {1, numelements * Q, Q}; /* *NOPAD* */ 551*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionCreateStrided(ceedparent, numelements, Q, numactivein * numactiveout, 552*bd882c8aSJames Wright numactivein * numactiveout * numelements * Q, strides, rstr)); 553*bd882c8aSJames Wright // Create assembled vector 554*bd882c8aSJames Wright CeedSize l_size = (CeedSize)numelements * Q * numactivein * numactiveout; 555*bd882c8aSJames Wright CeedCallBackend(CeedVectorCreate(ceedparent, l_size, assembled)); 556*bd882c8aSJames Wright } 557*bd882c8aSJames Wright CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); 558*bd882c8aSJames Wright CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a)); 559*bd882c8aSJames Wright 560*bd882c8aSJames Wright // Input basis apply 561*bd882c8aSJames Wright CeedCallBackend(CeedOperatorInputBasis_Sycl(numelements, qfinputfields, opinputfields, numinputfields, true, edata, impl)); 562*bd882c8aSJames Wright 563*bd882c8aSJames Wright // Assemble QFunction 564*bd882c8aSJames Wright for (CeedInt in = 0; in < numactivein; in++) { 565*bd882c8aSJames Wright // Set Inputs 566*bd882c8aSJames Wright CeedCallBackend(CeedVectorSetValue(activein[in], 1.0)); 567*bd882c8aSJames Wright if (numactivein > 1) { 568*bd882c8aSJames Wright CeedCallBackend(CeedVectorSetValue(activein[(in + numactivein - 1) % numactivein], 0.0)); 569*bd882c8aSJames Wright } 570*bd882c8aSJames Wright // Set Outputs 571*bd882c8aSJames Wright for (CeedInt out = 0; out < numoutputfields; out++) { 572*bd882c8aSJames Wright // Get output vector 573*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec)); 574*bd882c8aSJames Wright // Check if active output 575*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) { 576*bd882c8aSJames Wright CeedCallBackend(CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE, CEED_USE_POINTER, a)); 577*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[out], &size)); 578*bd882c8aSJames Wright a += size * Q * numelements; // Advance the pointer by the size of the output 579*bd882c8aSJames Wright } 580*bd882c8aSJames Wright } 581*bd882c8aSJames Wright // Apply QFunction 582*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionApply(qf, Q * numelements, impl->qvecsin, impl->qvecsout)); 583*bd882c8aSJames Wright } 584*bd882c8aSJames Wright 585*bd882c8aSJames Wright // Un-set output Qvecs to prevent accidental overwrite of Assembled 586*bd882c8aSJames Wright for (CeedInt out = 0; out < numoutputfields; out++) { 587*bd882c8aSJames Wright // Get output vector 588*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec)); 589*bd882c8aSJames Wright // Check if active output 590*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) { 591*bd882c8aSJames Wright CeedCallBackend(CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL)); 592*bd882c8aSJames Wright } 593*bd882c8aSJames Wright } 594*bd882c8aSJames Wright 595*bd882c8aSJames Wright // Restore input arrays 596*bd882c8aSJames Wright CeedCallBackend(CeedOperatorRestoreInputs_Sycl(numinputfields, qfinputfields, opinputfields, true, edata, impl)); 597*bd882c8aSJames Wright 598*bd882c8aSJames Wright // Restore output 599*bd882c8aSJames Wright CeedCallBackend(CeedVectorRestoreArray(*assembled, &a)); 600*bd882c8aSJames Wright 601*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 602*bd882c8aSJames Wright } 603*bd882c8aSJames Wright 604*bd882c8aSJames Wright //------------------------------------------------------------------------------ 605*bd882c8aSJames Wright // Assemble Linear QFunction 606*bd882c8aSJames Wright //------------------------------------------------------------------------------ 607*bd882c8aSJames Wright static int CeedOperatorLinearAssembleQFunction_Sycl(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { 608*bd882c8aSJames Wright return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, true, assembled, rstr, request); 609*bd882c8aSJames Wright } 610*bd882c8aSJames Wright 611*bd882c8aSJames Wright //------------------------------------------------------------------------------ 612*bd882c8aSJames Wright // Update Assembled Linear QFunction 613*bd882c8aSJames Wright //------------------------------------------------------------------------------ 614*bd882c8aSJames Wright static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { 615*bd882c8aSJames Wright return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, false, &assembled, &rstr, request); 616*bd882c8aSJames Wright } 617*bd882c8aSJames Wright 618*bd882c8aSJames Wright //------------------------------------------------------------------------------ 619*bd882c8aSJames Wright // Create point block restriction 620*bd882c8aSJames Wright //------------------------------------------------------------------------------ 621*bd882c8aSJames Wright static int CreatePBRestriction(CeedElemRestriction rstr, CeedElemRestriction *pbRstr) { 622*bd882c8aSJames Wright Ceed ceed; 623*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 624*bd882c8aSJames Wright const CeedInt *offsets; 625*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); 626*bd882c8aSJames Wright 627*bd882c8aSJames Wright // Expand offsets 628*bd882c8aSJames Wright CeedInt nelem, ncomp, elemsize, compstride, *pbOffsets; 629*bd882c8aSJames Wright CeedSize l_size; 630*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &nelem)); 631*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &ncomp)); 632*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elemsize)); 633*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &compstride)); 634*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); 635*bd882c8aSJames Wright CeedInt shift = ncomp; 636*bd882c8aSJames Wright if (compstride != 1) shift *= ncomp; 637*bd882c8aSJames Wright CeedCallBackend(CeedCalloc(nelem * elemsize, &pbOffsets)); 638*bd882c8aSJames Wright for (CeedInt i = 0; i < nelem * elemsize; i++) { 639*bd882c8aSJames Wright pbOffsets[i] = offsets[i] * shift; 640*bd882c8aSJames Wright } 641*bd882c8aSJames Wright 642*bd882c8aSJames Wright // Create new restriction 643*bd882c8aSJames Wright CeedCallBackend( 644*bd882c8aSJames Wright CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp * ncomp, 1, l_size * ncomp, CEED_MEM_HOST, CEED_OWN_POINTER, pbOffsets, pbRstr)); 645*bd882c8aSJames Wright 646*bd882c8aSJames Wright // Cleanup 647*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); 648*bd882c8aSJames Wright 649*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 650*bd882c8aSJames Wright } 651*bd882c8aSJames Wright 652*bd882c8aSJames Wright //------------------------------------------------------------------------------ 653*bd882c8aSJames Wright // Assemble diagonal setup 654*bd882c8aSJames Wright //------------------------------------------------------------------------------ 655*bd882c8aSJames Wright static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const bool pointBlock) { 656*bd882c8aSJames Wright Ceed ceed; 657*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 658*bd882c8aSJames Wright CeedQFunction qf; 659*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 660*bd882c8aSJames Wright CeedInt numinputfields, numoutputfields; 661*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields)); 662*bd882c8aSJames Wright 663*bd882c8aSJames Wright // Determine active input basis 664*bd882c8aSJames Wright CeedOperatorField *opfields; 665*bd882c8aSJames Wright CeedQFunctionField *qffields; 666*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); 667*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); 668*bd882c8aSJames Wright CeedInt numemodein = 0, ncomp = 0, dim = 1; 669*bd882c8aSJames Wright CeedEvalMode *emodein = NULL; 670*bd882c8aSJames Wright CeedBasis basisin = NULL; 671*bd882c8aSJames Wright CeedElemRestriction rstrin = NULL; 672*bd882c8aSJames Wright for (CeedInt i = 0; i < numinputfields; i++) { 673*bd882c8aSJames Wright CeedVector vec; 674*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); 675*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) { 676*bd882c8aSJames Wright CeedElemRestriction rstr; 677*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisin)); 678*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetNumComponents(basisin, &ncomp)); 679*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetDimension(basisin, &dim)); 680*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); 681*bd882c8aSJames Wright if (rstrin && rstrin != rstr) { 682*bd882c8aSJames Wright // LCOV_EXCL_START 683*bd882c8aSJames Wright return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); 684*bd882c8aSJames Wright // LCOV_EXCL_STOP 685*bd882c8aSJames Wright } 686*bd882c8aSJames Wright rstrin = rstr; 687*bd882c8aSJames Wright CeedEvalMode emode; 688*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); 689*bd882c8aSJames Wright switch (emode) { 690*bd882c8aSJames Wright case CEED_EVAL_NONE: 691*bd882c8aSJames Wright case CEED_EVAL_INTERP: 692*bd882c8aSJames Wright CeedCallBackend(CeedRealloc(numemodein + 1, &emodein)); 693*bd882c8aSJames Wright emodein[numemodein] = emode; 694*bd882c8aSJames Wright numemodein += 1; 695*bd882c8aSJames Wright break; 696*bd882c8aSJames Wright case CEED_EVAL_GRAD: 697*bd882c8aSJames Wright CeedCallBackend(CeedRealloc(numemodein + dim, &emodein)); 698*bd882c8aSJames Wright for (CeedInt d = 0; d < dim; d++) emodein[numemodein + d] = emode; 699*bd882c8aSJames Wright numemodein += dim; 700*bd882c8aSJames Wright break; 701*bd882c8aSJames Wright case CEED_EVAL_WEIGHT: 702*bd882c8aSJames Wright case CEED_EVAL_DIV: 703*bd882c8aSJames Wright case CEED_EVAL_CURL: 704*bd882c8aSJames Wright break; // Caught by QF Assembly 705*bd882c8aSJames Wright } 706*bd882c8aSJames Wright } 707*bd882c8aSJames Wright } 708*bd882c8aSJames Wright 709*bd882c8aSJames Wright // Determine active output basis 710*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); 711*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); 712*bd882c8aSJames Wright CeedInt numemodeout = 0; 713*bd882c8aSJames Wright CeedEvalMode *emodeout = NULL; 714*bd882c8aSJames Wright CeedBasis basisout = NULL; 715*bd882c8aSJames Wright CeedElemRestriction rstrout = NULL; 716*bd882c8aSJames Wright for (CeedInt i = 0; i < numoutputfields; i++) { 717*bd882c8aSJames Wright CeedVector vec; 718*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); 719*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) { 720*bd882c8aSJames Wright CeedElemRestriction rstr; 721*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisout)); 722*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); 723*bd882c8aSJames Wright if (rstrout && rstrout != rstr) { 724*bd882c8aSJames Wright // LCOV_EXCL_START 725*bd882c8aSJames Wright return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); 726*bd882c8aSJames Wright // LCOV_EXCL_STOP 727*bd882c8aSJames Wright } 728*bd882c8aSJames Wright rstrout = rstr; 729*bd882c8aSJames Wright CeedEvalMode emode; 730*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); 731*bd882c8aSJames Wright switch (emode) { 732*bd882c8aSJames Wright case CEED_EVAL_NONE: 733*bd882c8aSJames Wright case CEED_EVAL_INTERP: 734*bd882c8aSJames Wright CeedCallBackend(CeedRealloc(numemodeout + 1, &emodeout)); 735*bd882c8aSJames Wright emodeout[numemodeout] = emode; 736*bd882c8aSJames Wright numemodeout += 1; 737*bd882c8aSJames Wright break; 738*bd882c8aSJames Wright case CEED_EVAL_GRAD: 739*bd882c8aSJames Wright CeedCallBackend(CeedRealloc(numemodeout + dim, &emodeout)); 740*bd882c8aSJames Wright for (CeedInt d = 0; d < dim; d++) emodeout[numemodeout + d] = emode; 741*bd882c8aSJames Wright numemodeout += dim; 742*bd882c8aSJames Wright break; 743*bd882c8aSJames Wright case CEED_EVAL_WEIGHT: 744*bd882c8aSJames Wright case CEED_EVAL_DIV: 745*bd882c8aSJames Wright case CEED_EVAL_CURL: 746*bd882c8aSJames Wright break; // Caught by QF Assembly 747*bd882c8aSJames Wright } 748*bd882c8aSJames Wright } 749*bd882c8aSJames Wright } 750*bd882c8aSJames Wright 751*bd882c8aSJames Wright // Operator data struct 752*bd882c8aSJames Wright CeedOperator_Sycl *impl; 753*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetData(op, &impl)); 754*bd882c8aSJames Wright Ceed_Sycl *sycl_data; 755*bd882c8aSJames Wright CeedCallBackend(CeedGetData(ceed, &sycl_data)); 756*bd882c8aSJames Wright CeedCallBackend(CeedCalloc(1, &impl->diag)); 757*bd882c8aSJames Wright CeedOperatorDiag_Sycl *diag = impl->diag; 758*bd882c8aSJames Wright diag->basisin = basisin; 759*bd882c8aSJames Wright diag->basisout = basisout; 760*bd882c8aSJames Wright diag->h_emodein = emodein; 761*bd882c8aSJames Wright diag->h_emodeout = emodeout; 762*bd882c8aSJames Wright diag->numemodein = numemodein; 763*bd882c8aSJames Wright diag->numemodeout = numemodeout; 764*bd882c8aSJames Wright 765*bd882c8aSJames Wright // Kernel parameters 766*bd882c8aSJames Wright CeedInt nnodes, nqpts; 767*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetNumNodes(basisin, &nnodes)); 768*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetNumQuadraturePoints(basisin, &nqpts)); 769*bd882c8aSJames Wright diag->nnodes = nnodes; 770*bd882c8aSJames Wright diag->nqpts = nqpts; 771*bd882c8aSJames Wright diag->ncomp = ncomp; 772*bd882c8aSJames Wright 773*bd882c8aSJames Wright // Basis matrices 774*bd882c8aSJames Wright const CeedInt iLen = nqpts * nnodes; 775*bd882c8aSJames Wright const CeedInt gLen = nqpts * nnodes * dim; 776*bd882c8aSJames Wright const CeedScalar *interpin, *interpout, *gradin, *gradout; 777*bd882c8aSJames Wright 778*bd882c8aSJames Wright // CEED_EVAL_NONE 779*bd882c8aSJames Wright CeedScalar *identity = NULL; 780*bd882c8aSJames Wright bool evalNone = false; 781*bd882c8aSJames Wright for (CeedInt i = 0; i < numemodein; i++) evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE); 782*bd882c8aSJames Wright for (CeedInt i = 0; i < numemodeout; i++) evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE); 783*bd882c8aSJames Wright 784*bd882c8aSJames Wright // Order queue 785*bd882c8aSJames Wright sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); 786*bd882c8aSJames Wright 787*bd882c8aSJames Wright std::vector<sycl::event> copy_events; 788*bd882c8aSJames Wright if (evalNone) { 789*bd882c8aSJames Wright CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity)); 790*bd882c8aSJames Wright for (CeedInt i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; 791*bd882c8aSJames Wright CeedCallSycl(ceed, diag->d_identity = sycl::malloc_device<CeedScalar>(iLen, sycl_data->sycl_device, sycl_data->sycl_context)); 792*bd882c8aSJames Wright sycl::event identity_copy = sycl_data->sycl_queue.copy<CeedScalar>(identity, diag->d_identity, iLen, {e}); 793*bd882c8aSJames Wright copy_events.push_back(identity_copy); 794*bd882c8aSJames Wright } 795*bd882c8aSJames Wright 796*bd882c8aSJames Wright // CEED_EVAL_INTERP 797*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetInterp(basisin, &interpin)); 798*bd882c8aSJames Wright CeedCallSycl(ceed, diag->d_interpin = sycl::malloc_device<CeedScalar>(iLen, sycl_data->sycl_device, sycl_data->sycl_context)); 799*bd882c8aSJames Wright sycl::event interpin_copy = sycl_data->sycl_queue.copy<CeedScalar>(interpin, diag->d_interpin, iLen, {e}); 800*bd882c8aSJames Wright copy_events.push_back(interpin_copy); 801*bd882c8aSJames Wright 802*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetInterp(basisout, &interpout)); 803*bd882c8aSJames Wright CeedCallSycl(ceed, diag->d_interpout = sycl::malloc_device<CeedScalar>(iLen, sycl_data->sycl_device, sycl_data->sycl_context)); 804*bd882c8aSJames Wright sycl::event interpout_copy = sycl_data->sycl_queue.copy<CeedScalar>(interpout, diag->d_interpout, iLen, {e}); 805*bd882c8aSJames Wright copy_events.push_back(interpout_copy); 806*bd882c8aSJames Wright 807*bd882c8aSJames Wright // CEED_EVAL_GRAD 808*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetGrad(basisin, &gradin)); 809*bd882c8aSJames Wright CeedCallSycl(ceed, diag->d_gradin = sycl::malloc_device<CeedScalar>(gLen, sycl_data->sycl_device, sycl_data->sycl_context)); 810*bd882c8aSJames Wright sycl::event gradin_copy = sycl_data->sycl_queue.copy<CeedScalar>(gradin, diag->d_gradin, gLen, {e}); 811*bd882c8aSJames Wright copy_events.push_back(gradin_copy); 812*bd882c8aSJames Wright 813*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetGrad(basisout, &gradout)); 814*bd882c8aSJames Wright CeedCallSycl(ceed, diag->d_gradout = sycl::malloc_device<CeedScalar>(gLen, sycl_data->sycl_device, sycl_data->sycl_context)); 815*bd882c8aSJames Wright sycl::event gradout_copy = sycl_data->sycl_queue.copy<CeedScalar>(gradout, diag->d_gradout, gLen, {e}); 816*bd882c8aSJames Wright copy_events.push_back(gradout_copy); 817*bd882c8aSJames Wright 818*bd882c8aSJames Wright // Arrays of emodes 819*bd882c8aSJames Wright CeedCallSycl(ceed, diag->d_emodein = sycl::malloc_device<CeedEvalMode>(numemodein, sycl_data->sycl_device, sycl_data->sycl_context)); 820*bd882c8aSJames Wright sycl::event emodein_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(emodein, diag->d_emodein, numemodein, {e}); 821*bd882c8aSJames Wright copy_events.push_back(emodein_copy); 822*bd882c8aSJames Wright 823*bd882c8aSJames Wright CeedCallSycl(ceed, diag->d_emodeout = sycl::malloc_device<CeedEvalMode>(numemodeout, sycl_data->sycl_device, sycl_data->sycl_context)); 824*bd882c8aSJames Wright sycl::event emodeout_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(emodeout, diag->d_emodeout, numemodeout, {e}); 825*bd882c8aSJames Wright copy_events.push_back(emodeout_copy); 826*bd882c8aSJames Wright 827*bd882c8aSJames Wright // Restriction 828*bd882c8aSJames Wright diag->diagrstr = rstrout; 829*bd882c8aSJames Wright 830*bd882c8aSJames Wright // Wait for all copies to complete and handle exceptions 831*bd882c8aSJames Wright CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events)); 832*bd882c8aSJames Wright 833*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 834*bd882c8aSJames Wright } 835*bd882c8aSJames Wright 836*bd882c8aSJames Wright //------------------------------------------------------------------------------ 837*bd882c8aSJames Wright // Kernel for diagonal assembly 838*bd882c8aSJames Wright //------------------------------------------------------------------------------ 839*bd882c8aSJames Wright static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool pointBlock, const CeedInt nelem, const CeedOperatorDiag_Sycl *diag, 840*bd882c8aSJames Wright const CeedScalar *assembledqfarray, CeedScalar *elemdiagarray) { 841*bd882c8aSJames Wright const CeedInt nnodes = diag->nnodes; 842*bd882c8aSJames Wright const CeedInt nqpts = diag->nqpts; 843*bd882c8aSJames Wright const CeedInt ncomp = diag->ncomp; 844*bd882c8aSJames Wright const CeedInt numemodein = diag->numemodein; 845*bd882c8aSJames Wright const CeedInt numemodeout = diag->numemodeout; 846*bd882c8aSJames Wright 847*bd882c8aSJames Wright const CeedScalar *identity = diag->d_identity; 848*bd882c8aSJames Wright const CeedScalar *interpin = diag->d_interpin; 849*bd882c8aSJames Wright const CeedScalar *gradin = diag->d_gradin; 850*bd882c8aSJames Wright const CeedScalar *interpout = diag->d_interpout; 851*bd882c8aSJames Wright const CeedScalar *gradout = diag->d_gradout; 852*bd882c8aSJames Wright const CeedEvalMode *emodein = diag->d_emodein; 853*bd882c8aSJames Wright const CeedEvalMode *emodeout = diag->d_emodeout; 854*bd882c8aSJames Wright 855*bd882c8aSJames Wright sycl::range<1> kernel_range(nelem * nnodes); 856*bd882c8aSJames Wright 857*bd882c8aSJames Wright // Order queue 858*bd882c8aSJames Wright sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 859*bd882c8aSJames Wright sycl_queue.parallel_for<CeedOperatorSyclLinearDiagonal>(kernel_range, {e}, [=](sycl::id<1> idx) { 860*bd882c8aSJames Wright const CeedInt tid = idx % nnodes; 861*bd882c8aSJames Wright const CeedInt e = idx / nnodes; 862*bd882c8aSJames Wright 863*bd882c8aSJames Wright // Compute the diagonal of B^T D B 864*bd882c8aSJames Wright // Each element 865*bd882c8aSJames Wright CeedInt dout = -1; 866*bd882c8aSJames Wright // Each basis eval mode pair 867*bd882c8aSJames Wright for (CeedInt eout = 0; eout < numemodeout; eout++) { 868*bd882c8aSJames Wright const CeedScalar *bt = NULL; 869*bd882c8aSJames Wright if (emodeout[eout] == CEED_EVAL_GRAD) ++dout; 870*bd882c8aSJames Wright CeedOperatorGetBasisPointer_Sycl(&bt, emodeout[eout], identity, interpout, &gradout[dout * nqpts * nnodes]); 871*bd882c8aSJames Wright CeedInt din = -1; 872*bd882c8aSJames Wright for (CeedInt ein = 0; ein < numemodein; ein++) { 873*bd882c8aSJames Wright const CeedScalar *b = NULL; 874*bd882c8aSJames Wright if (emodein[ein] == CEED_EVAL_GRAD) ++din; 875*bd882c8aSJames Wright CeedOperatorGetBasisPointer_Sycl(&b, emodein[ein], identity, interpin, &gradin[din * nqpts * nnodes]); 876*bd882c8aSJames Wright // Each component 877*bd882c8aSJames Wright for (CeedInt compOut = 0; compOut < ncomp; compOut++) { 878*bd882c8aSJames Wright // Each qpoint/node pair 879*bd882c8aSJames Wright if (pointBlock) { 880*bd882c8aSJames Wright // Point Block Diagonal 881*bd882c8aSJames Wright for (CeedInt compIn = 0; compIn < ncomp; compIn++) { 882*bd882c8aSJames Wright CeedScalar evalue = 0.0; 883*bd882c8aSJames Wright for (CeedInt q = 0; q < nqpts; q++) { 884*bd882c8aSJames Wright const CeedScalar qfvalue = 885*bd882c8aSJames Wright assembledqfarray[((((ein * ncomp + compIn) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q]; 886*bd882c8aSJames Wright evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid]; 887*bd882c8aSJames Wright } 888*bd882c8aSJames Wright elemdiagarray[((compOut * ncomp + compIn) * nelem + e) * nnodes + tid] += evalue; 889*bd882c8aSJames Wright } 890*bd882c8aSJames Wright } else { 891*bd882c8aSJames Wright // Diagonal Only 892*bd882c8aSJames Wright CeedScalar evalue = 0.0; 893*bd882c8aSJames Wright for (CeedInt q = 0; q < nqpts; q++) { 894*bd882c8aSJames Wright const CeedScalar qfvalue = 895*bd882c8aSJames Wright assembledqfarray[((((ein * ncomp + compOut) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q]; 896*bd882c8aSJames Wright evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid]; 897*bd882c8aSJames Wright } 898*bd882c8aSJames Wright elemdiagarray[(compOut * nelem + e) * nnodes + tid] += evalue; 899*bd882c8aSJames Wright } 900*bd882c8aSJames Wright } 901*bd882c8aSJames Wright } 902*bd882c8aSJames Wright } 903*bd882c8aSJames Wright }); 904*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 905*bd882c8aSJames Wright } 906*bd882c8aSJames Wright 907*bd882c8aSJames Wright //------------------------------------------------------------------------------ 908*bd882c8aSJames Wright // Assemble diagonal common code 909*bd882c8aSJames Wright //------------------------------------------------------------------------------ 910*bd882c8aSJames Wright static inline int CeedOperatorAssembleDiagonalCore_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool pointBlock) { 911*bd882c8aSJames Wright Ceed ceed; 912*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 913*bd882c8aSJames Wright CeedOperator_Sycl *impl; 914*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetData(op, &impl)); 915*bd882c8aSJames Wright Ceed_Sycl *sycl_data; 916*bd882c8aSJames Wright CeedCallBackend(CeedGetData(ceed, &sycl_data)); 917*bd882c8aSJames Wright 918*bd882c8aSJames Wright // Assemble QFunction 919*bd882c8aSJames Wright CeedVector assembledqf = NULL; 920*bd882c8aSJames Wright CeedElemRestriction rstr = NULL; 921*bd882c8aSJames Wright CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, &rstr, request)); 922*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); 923*bd882c8aSJames Wright 924*bd882c8aSJames Wright // Setup 925*bd882c8aSJames Wright if (!impl->diag) { 926*bd882c8aSJames Wright CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Sycl(op, pointBlock)); 927*bd882c8aSJames Wright } 928*bd882c8aSJames Wright CeedOperatorDiag_Sycl *diag = impl->diag; 929*bd882c8aSJames Wright assert(diag != NULL); 930*bd882c8aSJames Wright 931*bd882c8aSJames Wright // Restriction 932*bd882c8aSJames Wright if (pointBlock && !diag->pbdiagrstr) { 933*bd882c8aSJames Wright CeedElemRestriction pbdiagrstr; 934*bd882c8aSJames Wright CeedCallBackend(CreatePBRestriction(diag->diagrstr, &pbdiagrstr)); 935*bd882c8aSJames Wright diag->pbdiagrstr = pbdiagrstr; 936*bd882c8aSJames Wright } 937*bd882c8aSJames Wright CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr; 938*bd882c8aSJames Wright 939*bd882c8aSJames Wright // Create diagonal vector 940*bd882c8aSJames Wright CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag; 941*bd882c8aSJames Wright if (!elemdiag) { 942*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag)); 943*bd882c8aSJames Wright if (pointBlock) diag->pbelemdiag = elemdiag; 944*bd882c8aSJames Wright else diag->elemdiag = elemdiag; 945*bd882c8aSJames Wright } 946*bd882c8aSJames Wright CeedCallBackend(CeedVectorSetValue(elemdiag, 0.0)); 947*bd882c8aSJames Wright 948*bd882c8aSJames Wright // Assemble element operator diagonals 949*bd882c8aSJames Wright CeedScalar *elemdiagarray; 950*bd882c8aSJames Wright const CeedScalar *assembledqfarray; 951*bd882c8aSJames Wright CeedCallBackend(CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray)); 952*bd882c8aSJames Wright CeedCallBackend(CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray)); 953*bd882c8aSJames Wright CeedInt nelem; 954*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetNumElements(diagrstr, &nelem)); 955*bd882c8aSJames Wright 956*bd882c8aSJames Wright // Compute the diagonal of B^T D B 957*bd882c8aSJames Wright // Umesh: This needs to be reviewed later 958*bd882c8aSJames Wright // if (pointBlock) { 959*bd882c8aSJames Wright // CeedCallBackend(CeedOperatorLinearPointBlockDiagonal_Sycl(sycl_data->sycl_queue, nelem, diag, assembledqfarray, elemdiagarray)); 960*bd882c8aSJames Wright //} else { 961*bd882c8aSJames Wright CeedCallBackend(CeedOperatorLinearDiagonal_Sycl(sycl_data->sycl_queue, pointBlock, nelem, diag, assembledqfarray, elemdiagarray)); 962*bd882c8aSJames Wright // } 963*bd882c8aSJames Wright 964*bd882c8aSJames Wright // Wait for queue to complete and handle exceptions 965*bd882c8aSJames Wright sycl_data->sycl_queue.wait_and_throw(); 966*bd882c8aSJames Wright 967*bd882c8aSJames Wright // Restore arrays 968*bd882c8aSJames Wright CeedCallBackend(CeedVectorRestoreArray(elemdiag, &elemdiagarray)); 969*bd882c8aSJames Wright CeedCallBackend(CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray)); 970*bd882c8aSJames Wright 971*bd882c8aSJames Wright // Assemble local operator diagonal 972*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, assembled, request)); 973*bd882c8aSJames Wright 974*bd882c8aSJames Wright // Cleanup 975*bd882c8aSJames Wright CeedCallBackend(CeedVectorDestroy(&assembledqf)); 976*bd882c8aSJames Wright 977*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 978*bd882c8aSJames Wright } 979*bd882c8aSJames Wright 980*bd882c8aSJames Wright //------------------------------------------------------------------------------ 981*bd882c8aSJames Wright // Assemble Linear Diagonal 982*bd882c8aSJames Wright //------------------------------------------------------------------------------ 983*bd882c8aSJames Wright static int CeedOperatorLinearAssembleAddDiagonal_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request) { 984*bd882c8aSJames Wright CeedCallBackend(CeedOperatorAssembleDiagonalCore_Sycl(op, assembled, request, false)); 985*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 986*bd882c8aSJames Wright } 987*bd882c8aSJames Wright 988*bd882c8aSJames Wright //------------------------------------------------------------------------------ 989*bd882c8aSJames Wright // Assemble Linear Point Block Diagonal 990*bd882c8aSJames Wright //------------------------------------------------------------------------------ 991*bd882c8aSJames Wright static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request) { 992*bd882c8aSJames Wright CeedCallBackend(CeedOperatorAssembleDiagonalCore_Sycl(op, assembled, request, true)); 993*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 994*bd882c8aSJames Wright } 995*bd882c8aSJames Wright 996*bd882c8aSJames Wright //------------------------------------------------------------------------------ 997*bd882c8aSJames Wright // Single operator assembly setup 998*bd882c8aSJames Wright //------------------------------------------------------------------------------ 999*bd882c8aSJames Wright static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { 1000*bd882c8aSJames Wright Ceed ceed; 1001*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1002*bd882c8aSJames Wright CeedOperator_Sycl *impl; 1003*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetData(op, &impl)); 1004*bd882c8aSJames Wright 1005*bd882c8aSJames Wright // Get input and output fields 1006*bd882c8aSJames Wright CeedInt num_input_fields, num_output_fields; 1007*bd882c8aSJames Wright CeedOperatorField *input_fields; 1008*bd882c8aSJames Wright CeedOperatorField *output_fields; 1009*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); 1010*bd882c8aSJames Wright 1011*bd882c8aSJames Wright // Determine active input basis eval mode 1012*bd882c8aSJames Wright CeedQFunction qf; 1013*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1014*bd882c8aSJames Wright CeedQFunctionField *qf_fields; 1015*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 1016*bd882c8aSJames Wright // Note that the kernel will treat each dimension of a gradient action separately; 1017*bd882c8aSJames Wright // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment by dim. 1018*bd882c8aSJames Wright // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so 1019*bd882c8aSJames Wright // num_B_in_mats_to_load will be incremented by 1. 1020*bd882c8aSJames Wright CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0; 1021*bd882c8aSJames Wright CeedEvalMode *eval_mode_in = NULL; // will be of size num_B_in_mats_load 1022*bd882c8aSJames Wright CeedBasis basis_in = NULL; 1023*bd882c8aSJames Wright CeedInt nqpts = 0, esize = 0; 1024*bd882c8aSJames Wright CeedElemRestriction rstr_in = NULL; 1025*bd882c8aSJames Wright for (CeedInt i = 0; i < num_input_fields; i++) { 1026*bd882c8aSJames Wright CeedVector vec; 1027*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); 1028*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) { 1029*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in)); 1030*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); 1031*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &nqpts)); 1032*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); 1033*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &esize)); 1034*bd882c8aSJames Wright CeedEvalMode eval_mode; 1035*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 1036*bd882c8aSJames Wright if (eval_mode != CEED_EVAL_NONE) { 1037*bd882c8aSJames Wright CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); 1038*bd882c8aSJames Wright eval_mode_in[num_B_in_mats_to_load] = eval_mode; 1039*bd882c8aSJames Wright num_B_in_mats_to_load += 1; 1040*bd882c8aSJames Wright if (eval_mode == CEED_EVAL_GRAD) { 1041*bd882c8aSJames Wright num_emode_in += dim; 1042*bd882c8aSJames Wright size_B_in += dim * esize * nqpts; 1043*bd882c8aSJames Wright } else { 1044*bd882c8aSJames Wright num_emode_in += 1; 1045*bd882c8aSJames Wright size_B_in += esize * nqpts; 1046*bd882c8aSJames Wright } 1047*bd882c8aSJames Wright } 1048*bd882c8aSJames Wright } 1049*bd882c8aSJames Wright } 1050*bd882c8aSJames Wright 1051*bd882c8aSJames Wright // Determine active output basis; basis_out and rstr_out only used if same as input, TODO 1052*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 1053*bd882c8aSJames Wright CeedInt num_emode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0; 1054*bd882c8aSJames Wright CeedEvalMode *eval_mode_out = NULL; 1055*bd882c8aSJames Wright CeedBasis basis_out = NULL; 1056*bd882c8aSJames Wright CeedElemRestriction rstr_out = NULL; 1057*bd882c8aSJames Wright for (CeedInt i = 0; i < num_output_fields; i++) { 1058*bd882c8aSJames Wright CeedVector vec; 1059*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); 1060*bd882c8aSJames Wright if (vec == CEED_VECTOR_ACTIVE) { 1061*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out)); 1062*bd882c8aSJames Wright CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); 1063*bd882c8aSJames Wright if (rstr_out && rstr_out != rstr_in) { 1064*bd882c8aSJames Wright // LCOV_EXCL_START 1065*bd882c8aSJames Wright return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); 1066*bd882c8aSJames Wright // LCOV_EXCL_STOP 1067*bd882c8aSJames Wright } 1068*bd882c8aSJames Wright CeedEvalMode eval_mode; 1069*bd882c8aSJames Wright CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 1070*bd882c8aSJames Wright if (eval_mode != CEED_EVAL_NONE) { 1071*bd882c8aSJames Wright CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); 1072*bd882c8aSJames Wright eval_mode_out[num_B_out_mats_to_load] = eval_mode; 1073*bd882c8aSJames Wright num_B_out_mats_to_load += 1; 1074*bd882c8aSJames Wright if (eval_mode == CEED_EVAL_GRAD) { 1075*bd882c8aSJames Wright num_emode_out += dim; 1076*bd882c8aSJames Wright size_B_out += dim * esize * nqpts; 1077*bd882c8aSJames Wright } else { 1078*bd882c8aSJames Wright num_emode_out += 1; 1079*bd882c8aSJames Wright size_B_out += esize * nqpts; 1080*bd882c8aSJames Wright } 1081*bd882c8aSJames Wright } 1082*bd882c8aSJames Wright } 1083*bd882c8aSJames Wright } 1084*bd882c8aSJames Wright 1085*bd882c8aSJames Wright if (num_emode_in == 0 || num_emode_out == 0) { 1086*bd882c8aSJames Wright // LCOV_EXCL_START 1087*bd882c8aSJames Wright return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); 1088*bd882c8aSJames Wright // LCOV_EXCL_STOP 1089*bd882c8aSJames Wright } 1090*bd882c8aSJames Wright 1091*bd882c8aSJames Wright CeedInt nelem, ncomp; 1092*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &nelem)); 1093*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &ncomp)); 1094*bd882c8aSJames Wright 1095*bd882c8aSJames Wright CeedCallBackend(CeedCalloc(1, &impl->asmb)); 1096*bd882c8aSJames Wright CeedOperatorAssemble_Sycl *asmb = impl->asmb; 1097*bd882c8aSJames Wright asmb->nelem = nelem; 1098*bd882c8aSJames Wright 1099*bd882c8aSJames Wright Ceed_Sycl *sycl_data; 1100*bd882c8aSJames Wright CeedCallBackend(CeedGetData(ceed, &sycl_data)); 1101*bd882c8aSJames Wright 1102*bd882c8aSJames Wright // Kernel setup 1103*bd882c8aSJames Wright int elemsPerBlock = 1; 1104*bd882c8aSJames Wright asmb->elemsPerBlock = elemsPerBlock; 1105*bd882c8aSJames Wright CeedInt block_size = esize * esize * elemsPerBlock; 1106*bd882c8aSJames Wright /* CeedInt maxThreadsPerBlock = sycl_data->sycl_device.get_info<sycl::info::device::max_work_group_size>(); 1107*bd882c8aSJames Wright bool fallback = block_size > maxThreadsPerBlock; 1108*bd882c8aSJames Wright asmb->fallback = fallback; 1109*bd882c8aSJames Wright if (fallback) { 1110*bd882c8aSJames Wright // Use fallback kernel with 1D threadblock 1111*bd882c8aSJames Wright block_size = esize * elemsPerBlock; 1112*bd882c8aSJames Wright asmb->block_size_x = esize; 1113*bd882c8aSJames Wright asmb->block_size_y = 1; 1114*bd882c8aSJames Wright } else { // Use kernel with 2D threadblock 1115*bd882c8aSJames Wright asmb->block_size_x = esize; 1116*bd882c8aSJames Wright asmb->block_size_y = esize; 1117*bd882c8aSJames Wright }*/ 1118*bd882c8aSJames Wright asmb->block_size_x = esize; 1119*bd882c8aSJames Wright asmb->block_size_y = esize; 1120*bd882c8aSJames Wright asmb->numemodein = num_emode_in; 1121*bd882c8aSJames Wright asmb->numemodeout = num_emode_out; 1122*bd882c8aSJames Wright asmb->nqpts = nqpts; 1123*bd882c8aSJames Wright asmb->nnodes = esize; 1124*bd882c8aSJames Wright asmb->block_size = block_size; 1125*bd882c8aSJames Wright asmb->ncomp = ncomp; 1126*bd882c8aSJames Wright 1127*bd882c8aSJames Wright // Build 'full' B matrices (not 1D arrays used for tensor-product matrices 1128*bd882c8aSJames Wright const CeedScalar *interp_in, *grad_in; 1129*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); 1130*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); 1131*bd882c8aSJames Wright 1132*bd882c8aSJames Wright // Load into B_in, in order that they will be used in eval_mode 1133*bd882c8aSJames Wright CeedInt mat_start = 0; 1134*bd882c8aSJames Wright CeedCallSycl(ceed, asmb->d_B_in = sycl::malloc_device<CeedScalar>(size_B_in, sycl_data->sycl_device, sycl_data->sycl_context)); 1135*bd882c8aSJames Wright for (int i = 0; i < num_B_in_mats_to_load; i++) { 1136*bd882c8aSJames Wright CeedEvalMode eval_mode = eval_mode_in[i]; 1137*bd882c8aSJames Wright if (eval_mode == CEED_EVAL_INTERP) { 1138*bd882c8aSJames Wright // Order queue 1139*bd882c8aSJames Wright sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); 1140*bd882c8aSJames Wright sycl_data->sycl_queue.copy<CeedScalar>(interp_in, &asmb->d_B_in[mat_start], esize * nqpts, {e}); 1141*bd882c8aSJames Wright mat_start += esize * nqpts; 1142*bd882c8aSJames Wright } else if (eval_mode == CEED_EVAL_GRAD) { 1143*bd882c8aSJames Wright // Order queue 1144*bd882c8aSJames Wright sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); 1145*bd882c8aSJames Wright sycl_data->sycl_queue.copy<CeedScalar>(grad_in, &asmb->d_B_in[mat_start], dim * esize * nqpts, {e}); 1146*bd882c8aSJames Wright mat_start += dim * esize * nqpts; 1147*bd882c8aSJames Wright } 1148*bd882c8aSJames Wright } 1149*bd882c8aSJames Wright 1150*bd882c8aSJames Wright const CeedScalar *interp_out, *grad_out; 1151*bd882c8aSJames Wright // Note that this function currently assumes 1 basis, so this should always be true 1152*bd882c8aSJames Wright // for now 1153*bd882c8aSJames Wright if (basis_out == basis_in) { 1154*bd882c8aSJames Wright interp_out = interp_in; 1155*bd882c8aSJames Wright grad_out = grad_in; 1156*bd882c8aSJames Wright } else { 1157*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); 1158*bd882c8aSJames Wright CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); 1159*bd882c8aSJames Wright } 1160*bd882c8aSJames Wright 1161*bd882c8aSJames Wright // Load into B_out, in order that they will be used in eval_mode 1162*bd882c8aSJames Wright mat_start = 0; 1163*bd882c8aSJames Wright CeedCallSycl(ceed, asmb->d_B_out = sycl::malloc_device<CeedScalar>(size_B_out, sycl_data->sycl_device, sycl_data->sycl_context)); 1164*bd882c8aSJames Wright for (int i = 0; i < num_B_out_mats_to_load; i++) { 1165*bd882c8aSJames Wright CeedEvalMode eval_mode = eval_mode_out[i]; 1166*bd882c8aSJames Wright if (eval_mode == CEED_EVAL_INTERP) { 1167*bd882c8aSJames Wright // Order queue 1168*bd882c8aSJames Wright sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); 1169*bd882c8aSJames Wright sycl_data->sycl_queue.copy<CeedScalar>(interp_out, &asmb->d_B_out[mat_start], esize * nqpts, {e}); 1170*bd882c8aSJames Wright mat_start += esize * nqpts; 1171*bd882c8aSJames Wright } else if (eval_mode == CEED_EVAL_GRAD) { 1172*bd882c8aSJames Wright // Order queue 1173*bd882c8aSJames Wright sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); 1174*bd882c8aSJames Wright sycl_data->sycl_queue.copy<CeedScalar>(grad_out, &asmb->d_B_out[mat_start], dim * esize * nqpts, {e}); 1175*bd882c8aSJames Wright mat_start += dim * esize * nqpts; 1176*bd882c8aSJames Wright } 1177*bd882c8aSJames Wright } 1178*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 1179*bd882c8aSJames Wright } 1180*bd882c8aSJames Wright 1181*bd882c8aSJames Wright //------------------------------------------------------------------------------ 1182*bd882c8aSJames Wright // Matrix assembly kernel for low-order elements (3D thread block) 1183*bd882c8aSJames Wright //------------------------------------------------------------------------------ 1184*bd882c8aSJames Wright static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOperator_Sycl *impl, const CeedScalar *qf_array, 1185*bd882c8aSJames Wright CeedScalar *values_array) { 1186*bd882c8aSJames Wright // This kernels assumes B_in and B_out have the same number of quadrature points and basis points. 1187*bd882c8aSJames Wright // TODO: expand to more general cases 1188*bd882c8aSJames Wright CeedOperatorAssemble_Sycl *asmb = impl->asmb; 1189*bd882c8aSJames Wright const CeedInt nelem = asmb->nelem; 1190*bd882c8aSJames Wright const CeedInt nnodes = asmb->nnodes; 1191*bd882c8aSJames Wright const CeedInt ncomp = asmb->ncomp; 1192*bd882c8aSJames Wright const CeedInt nqpts = asmb->nqpts; 1193*bd882c8aSJames Wright const CeedInt numemodein = asmb->numemodein; 1194*bd882c8aSJames Wright const CeedInt numemodeout = asmb->numemodeout; 1195*bd882c8aSJames Wright 1196*bd882c8aSJames Wright // Strides for final output ordering, determined by the reference (inference) implementation of the symbolic assembly, slowest --> fastest: element, 1197*bd882c8aSJames Wright // comp_in, comp_out, node_row, node_col 1198*bd882c8aSJames Wright const CeedInt comp_out_stride = nnodes * nnodes; 1199*bd882c8aSJames Wright const CeedInt comp_in_stride = comp_out_stride * ncomp; 1200*bd882c8aSJames Wright const CeedInt e_stride = comp_in_stride * ncomp; 1201*bd882c8aSJames Wright // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt 1202*bd882c8aSJames Wright const CeedInt qe_stride = nqpts; 1203*bd882c8aSJames Wright const CeedInt qcomp_out_stride = nelem * qe_stride; 1204*bd882c8aSJames Wright const CeedInt qemode_out_stride = qcomp_out_stride * ncomp; 1205*bd882c8aSJames Wright const CeedInt qcomp_in_stride = qemode_out_stride * numemodeout; 1206*bd882c8aSJames Wright const CeedInt qemode_in_stride = qcomp_in_stride * ncomp; 1207*bd882c8aSJames Wright 1208*bd882c8aSJames Wright CeedScalar *B_in, *B_out; 1209*bd882c8aSJames Wright B_in = asmb->d_B_in; 1210*bd882c8aSJames Wright B_out = asmb->d_B_out; 1211*bd882c8aSJames Wright const CeedInt block_size_x = asmb->block_size_x; 1212*bd882c8aSJames Wright const CeedInt block_size_y = asmb->block_size_y; 1213*bd882c8aSJames Wright 1214*bd882c8aSJames Wright sycl::range<3> kernel_range(nelem, block_size_y, block_size_x); 1215*bd882c8aSJames Wright 1216*bd882c8aSJames Wright // Order queue 1217*bd882c8aSJames Wright sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 1218*bd882c8aSJames Wright sycl_queue.parallel_for<CeedOperatorSyclLinearAssemble>(kernel_range, {e}, [=](sycl::id<3> idx) { 1219*bd882c8aSJames Wright const int e = idx.get(0); // Element index 1220*bd882c8aSJames Wright const int l = idx.get(1); // The output column index of each B^TDB operation 1221*bd882c8aSJames Wright const int i = idx.get(2); // The output row index of each B^TDB operation 1222*bd882c8aSJames Wright // such that we have (Bout^T)_ij D_jk Bin_kl = C_il 1223*bd882c8aSJames Wright for (CeedInt comp_in = 0; comp_in < ncomp; comp_in++) { 1224*bd882c8aSJames Wright for (CeedInt comp_out = 0; comp_out < ncomp; comp_out++) { 1225*bd882c8aSJames Wright CeedScalar result = 0.0; 1226*bd882c8aSJames Wright CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; 1227*bd882c8aSJames Wright for (CeedInt emode_in = 0; emode_in < numemodein; emode_in++) { 1228*bd882c8aSJames Wright CeedInt b_in_index = emode_in * nqpts * nnodes; 1229*bd882c8aSJames Wright for (CeedInt emode_out = 0; emode_out < numemodeout; emode_out++) { 1230*bd882c8aSJames Wright CeedInt b_out_index = emode_out * nqpts * nnodes; 1231*bd882c8aSJames Wright CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; 1232*bd882c8aSJames Wright // Perform the B^T D B operation for this 'chunk' of D (the qf_array) 1233*bd882c8aSJames Wright for (CeedInt j = 0; j < nqpts; j++) { 1234*bd882c8aSJames Wright result += B_out[b_out_index + j * nnodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * nnodes + l]; 1235*bd882c8aSJames Wright } 1236*bd882c8aSJames Wright } // end of emode_out 1237*bd882c8aSJames Wright } // end of emode_in 1238*bd882c8aSJames Wright CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l; 1239*bd882c8aSJames Wright values_array[val_index] = result; 1240*bd882c8aSJames Wright } // end of out component 1241*bd882c8aSJames Wright } // end of in component 1242*bd882c8aSJames Wright }); 1243*bd882c8aSJames Wright 1244*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 1245*bd882c8aSJames Wright } 1246*bd882c8aSJames Wright 1247*bd882c8aSJames Wright //------------------------------------------------------------------------------ 1248*bd882c8aSJames Wright // Fallback kernel for larger orders (1D thread block) 1249*bd882c8aSJames Wright //------------------------------------------------------------------------------ 1250*bd882c8aSJames Wright /* 1251*bd882c8aSJames Wright static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, const CeedOperator_Sycl *impl, const CeedScalar *qf_array, 1252*bd882c8aSJames Wright CeedScalar *values_array) { 1253*bd882c8aSJames Wright // This kernel assumes B_in and B_out have the same number of quadrature points and basis points. 1254*bd882c8aSJames Wright // TODO: expand to more general cases 1255*bd882c8aSJames Wright CeedOperatorAssemble_Sycl *asmb = impl->asmb; 1256*bd882c8aSJames Wright const CeedInt nelem = asmb->nelem; 1257*bd882c8aSJames Wright const CeedInt nnodes = asmb->nnodes; 1258*bd882c8aSJames Wright const CeedInt ncomp = asmb->ncomp; 1259*bd882c8aSJames Wright const CeedInt nqpts = asmb->nqpts; 1260*bd882c8aSJames Wright const CeedInt numemodein = asmb->numemodein; 1261*bd882c8aSJames Wright const CeedInt numemodeout = asmb->numemodeout; 1262*bd882c8aSJames Wright 1263*bd882c8aSJames Wright // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: elememt, 1264*bd882c8aSJames Wright // comp_in, comp_out, node_row, node_col 1265*bd882c8aSJames Wright const CeedInt comp_out_stride = nnodes * nnodes; 1266*bd882c8aSJames Wright const CeedInt comp_in_stride = comp_out_stride * ncomp; 1267*bd882c8aSJames Wright const CeedInt e_stride = comp_in_stride * ncomp; 1268*bd882c8aSJames Wright // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt 1269*bd882c8aSJames Wright const CeedInt qe_stride = nqpts; 1270*bd882c8aSJames Wright const CeedInt qcomp_out_stride = nelem * qe_stride; 1271*bd882c8aSJames Wright const CeedInt qemode_out_stride = qcomp_out_stride * ncomp; 1272*bd882c8aSJames Wright const CeedInt qcomp_in_stride = qemode_out_stride * numemodeout; 1273*bd882c8aSJames Wright const CeedInt qemode_in_stride = qcomp_in_stride * ncomp; 1274*bd882c8aSJames Wright 1275*bd882c8aSJames Wright CeedScalar *B_in, *B_out; 1276*bd882c8aSJames Wright B_in = asmb->d_B_in; 1277*bd882c8aSJames Wright B_out = asmb->d_B_out; 1278*bd882c8aSJames Wright const CeedInt elemsPerBlock = asmb->elemsPerBlock; 1279*bd882c8aSJames Wright const CeedInt block_size_x = asmb->block_size_x; 1280*bd882c8aSJames Wright const CeedInt block_size_y = asmb->block_size_y; // This will be 1 for the fallback kernel 1281*bd882c8aSJames Wright 1282*bd882c8aSJames Wright const CeedInt grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); 1283*bd882c8aSJames Wright sycl::range<3> local_range(block_size_x, block_size_y, elemsPerBlock); 1284*bd882c8aSJames Wright sycl::range<3> global_range(grid * block_size_x, block_size_y, elemsPerBlock); 1285*bd882c8aSJames Wright sycl::nd_range<3> kernel_range(global_range, local_range); 1286*bd882c8aSJames Wright 1287*bd882c8aSJames Wright sycl_queue.parallel_for<CeedOperatorSyclLinearAssembleFallback>(kernel_range, [=](sycl::nd_item<3> work_item) { 1288*bd882c8aSJames Wright const CeedInt blockIdx = work_item.get_group(0); 1289*bd882c8aSJames Wright const CeedInt gridDimx = work_item.get_group_range(0); 1290*bd882c8aSJames Wright const CeedInt threadIdx = work_item.get_local_id(0); 1291*bd882c8aSJames Wright const CeedInt threadIdz = work_item.get_local_id(2); 1292*bd882c8aSJames Wright const CeedInt blockDimz = work_item.get_local_range(2); 1293*bd882c8aSJames Wright 1294*bd882c8aSJames Wright const int l = threadIdx; // The output column index of each B^TDB operation 1295*bd882c8aSJames Wright // such that we have (Bout^T)_ij D_jk Bin_kl = C_il 1296*bd882c8aSJames Wright for (CeedInt e = blockIdx * blockDimz + threadIdz; e < nelem; e += gridDimx * blockDimz) { 1297*bd882c8aSJames Wright for (CeedInt comp_in = 0; comp_in < ncomp; comp_in++) { 1298*bd882c8aSJames Wright for (CeedInt comp_out = 0; comp_out < ncomp; comp_out++) { 1299*bd882c8aSJames Wright for (CeedInt i = 0; i < nnodes; i++) { 1300*bd882c8aSJames Wright CeedScalar result = 0.0; 1301*bd882c8aSJames Wright CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; 1302*bd882c8aSJames Wright for (CeedInt emode_in = 0; emode_in < numemodein; emode_in++) { 1303*bd882c8aSJames Wright CeedInt b_in_index = emode_in * nqpts * nnodes; 1304*bd882c8aSJames Wright for (CeedInt emode_out = 0; emode_out < numemodeout; emode_out++) { 1305*bd882c8aSJames Wright CeedInt b_out_index = emode_out * nqpts * nnodes; 1306*bd882c8aSJames Wright CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; 1307*bd882c8aSJames Wright // Perform the B^T D B operation for this 'chunk' of D (the qf_array) 1308*bd882c8aSJames Wright for (CeedInt j = 0; j < nqpts; j++) { 1309*bd882c8aSJames Wright result += B_out[b_out_index + j * nnodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * nnodes + l]; 1310*bd882c8aSJames Wright } 1311*bd882c8aSJames Wright } // end of emode_out 1312*bd882c8aSJames Wright } // end of emode_in 1313*bd882c8aSJames Wright CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l; 1314*bd882c8aSJames Wright values_array[val_index] = result; 1315*bd882c8aSJames Wright } // end of loop over element node index, i 1316*bd882c8aSJames Wright } // end of out component 1317*bd882c8aSJames Wright } // end of in component 1318*bd882c8aSJames Wright } // end of element loop 1319*bd882c8aSJames Wright }); 1320*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 1321*bd882c8aSJames Wright }*/ 1322*bd882c8aSJames Wright 1323*bd882c8aSJames Wright //------------------------------------------------------------------------------ 1324*bd882c8aSJames Wright // Assemble matrix data for COO matrix of assembled operator. 1325*bd882c8aSJames Wright // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic. 1326*bd882c8aSJames Wright // 1327*bd882c8aSJames Wright // Note that this (and other assembly routines) currently assume only one active 1328*bd882c8aSJames Wright // input restriction/basis per operator (could have multiple basis eval modes). 1329*bd882c8aSJames Wright // TODO: allow multiple active input restrictions/basis objects 1330*bd882c8aSJames Wright //------------------------------------------------------------------------------ 1331*bd882c8aSJames Wright static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, CeedVector values) { 1332*bd882c8aSJames Wright Ceed ceed; 1333*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1334*bd882c8aSJames Wright CeedOperator_Sycl *impl; 1335*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetData(op, &impl)); 1336*bd882c8aSJames Wright Ceed_Sycl *sycl_data; 1337*bd882c8aSJames Wright CeedCallBackend(CeedGetData(ceed, &sycl_data)); 1338*bd882c8aSJames Wright 1339*bd882c8aSJames Wright // Setup 1340*bd882c8aSJames Wright if (!impl->asmb) { 1341*bd882c8aSJames Wright CeedCallBackend(CeedSingleOperatorAssembleSetup_Sycl(op)); 1342*bd882c8aSJames Wright assert(impl->asmb != NULL); 1343*bd882c8aSJames Wright } 1344*bd882c8aSJames Wright 1345*bd882c8aSJames Wright // Assemble QFunction 1346*bd882c8aSJames Wright CeedVector assembled_qf = NULL; 1347*bd882c8aSJames Wright CeedElemRestriction rstr_q = NULL; 1348*bd882c8aSJames Wright CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); 1349*bd882c8aSJames Wright CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); 1350*bd882c8aSJames Wright CeedScalar *values_array; 1351*bd882c8aSJames Wright CeedCallBackend(CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array)); 1352*bd882c8aSJames Wright values_array += offset; 1353*bd882c8aSJames Wright const CeedScalar *qf_array; 1354*bd882c8aSJames Wright CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); 1355*bd882c8aSJames Wright 1356*bd882c8aSJames Wright // Compute B^T D B 1357*bd882c8aSJames Wright CeedCallBackend(CeedOperatorLinearAssemble_Sycl(sycl_data->sycl_queue, impl, qf_array, values_array)); 1358*bd882c8aSJames Wright 1359*bd882c8aSJames Wright // Wait for kernels to be completed 1360*bd882c8aSJames Wright // Kris: Review if this is necessary -- enqueing an async barrier may be sufficient 1361*bd882c8aSJames Wright sycl_data->sycl_queue.wait_and_throw(); 1362*bd882c8aSJames Wright 1363*bd882c8aSJames Wright // Restore arrays 1364*bd882c8aSJames Wright CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); 1365*bd882c8aSJames Wright CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array)); 1366*bd882c8aSJames Wright 1367*bd882c8aSJames Wright // Cleanup 1368*bd882c8aSJames Wright CeedCallBackend(CeedVectorDestroy(&assembled_qf)); 1369*bd882c8aSJames Wright 1370*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 1371*bd882c8aSJames Wright } 1372*bd882c8aSJames Wright 1373*bd882c8aSJames Wright //------------------------------------------------------------------------------ 1374*bd882c8aSJames Wright // Create operator 1375*bd882c8aSJames Wright //------------------------------------------------------------------------------ 1376*bd882c8aSJames Wright int CeedOperatorCreate_Sycl(CeedOperator op) { 1377*bd882c8aSJames Wright Ceed ceed; 1378*bd882c8aSJames Wright CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1379*bd882c8aSJames Wright CeedOperator_Sycl *impl; 1380*bd882c8aSJames Wright 1381*bd882c8aSJames Wright CeedCallBackend(CeedCalloc(1, &impl)); 1382*bd882c8aSJames Wright CeedCallBackend(CeedOperatorSetData(op, impl)); 1383*bd882c8aSJames Wright 1384*bd882c8aSJames Wright CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Sycl)); 1385*bd882c8aSJames Wright CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Sycl)); 1386*bd882c8aSJames Wright CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Sycl)); 1387*bd882c8aSJames Wright CeedCallBackend( 1388*bd882c8aSJames Wright CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl)); 1389*bd882c8aSJames Wright CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Sycl)); 1390*bd882c8aSJames Wright CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl)); 1391*bd882c8aSJames Wright CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl)); 1392*bd882c8aSJames Wright return CEED_ERROR_SUCCESS; 1393*bd882c8aSJames Wright } 1394*bd882c8aSJames Wright 1395*bd882c8aSJames Wright //------------------------------------------------------------------------------ 1396