1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other 2 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE 3 // files for details. 4 // 5 // SPDX-License-Identifier: BSD-2-Clause 6 // 7 // This file is part of CEED: http://github.com/ceed 8 9 #include <ceed/backend.h> 10 #include <ceed/ceed.h> 11 12 #include <cassert> 13 #include <string> 14 #include <sycl/sycl.hpp> 15 16 #include "../sycl/ceed-sycl-compile.hpp" 17 #include "ceed-sycl-ref.hpp" 18 19 class CeedOperatorSyclLinearDiagonal; 20 class CeedOperatorSyclLinearAssemble; 21 class CeedOperatorSyclLinearAssembleFallback; 22 23 //------------------------------------------------------------------------------ 24 // Get Basis Emode Pointer 25 //------------------------------------------------------------------------------ 26 void CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basis_ptr, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar *interp, 27 const CeedScalar *grad) { 28 switch (eval_mode) { 29 case CEED_EVAL_NONE: 30 *basis_ptr = identity; 31 break; 32 case CEED_EVAL_INTERP: 33 *basis_ptr = interp; 34 break; 35 case CEED_EVAL_GRAD: 36 *basis_ptr = grad; 37 break; 38 case CEED_EVAL_WEIGHT: 39 case CEED_EVAL_DIV: 40 case CEED_EVAL_CURL: 41 break; // Caught by QF Assembly 42 } 43 } 44 45 //------------------------------------------------------------------------------ 46 // Destroy operator 47 //------------------------------------------------------------------------------ 48 static int CeedOperatorDestroy_Sycl(CeedOperator op) { 49 Ceed ceed; 50 Ceed_Sycl *sycl_data; 51 CeedOperator_Sycl *impl; 52 53 CeedCallBackend(CeedOperatorGetData(op, &impl)); 54 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 55 CeedCallBackend(CeedGetData(ceed, &sycl_data)); 56 57 // Apply data 58 for (CeedInt i = 0; i < impl->num_e_in + impl->num_e_out; i++) { 59 CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i])); 60 } 61 CeedCallBackend(CeedFree(&impl->e_vecs)); 62 63 for (CeedInt i = 0; i < impl->num_e_in; i++) { 64 CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i])); 65 } 66 CeedCallBackend(CeedFree(&impl->q_vecs_in)); 67 68 for (CeedInt i = 0; i < impl->num_e_out; i++) { 69 CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); 70 } 71 CeedCallBackend(CeedFree(&impl->q_vecs_out)); 72 73 // QFunction assembly data 74 for (CeedInt i = 0; i < impl->num_active_in; i++) { 75 CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); 76 } 77 CeedCallBackend(CeedFree(&impl->qf_active_in)); 78 79 // Diag data 80 if (impl->diag) { 81 CeedCallBackend(CeedFree(&impl->diag->h_eval_mode_in)); 82 CeedCallBackend(CeedFree(&impl->diag->h_eval_mode_out)); 83 84 CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw()); 85 CeedCallSycl(ceed, sycl::free(impl->diag->d_eval_mode_in, sycl_data->sycl_context)); 86 CeedCallSycl(ceed, sycl::free(impl->diag->d_eval_mode_out, sycl_data->sycl_context)); 87 CeedCallSycl(ceed, sycl::free(impl->diag->d_identity, sycl_data->sycl_context)); 88 CeedCallSycl(ceed, sycl::free(impl->diag->d_interp_in, sycl_data->sycl_context)); 89 CeedCallSycl(ceed, sycl::free(impl->diag->d_interp_out, sycl_data->sycl_context)); 90 CeedCallSycl(ceed, sycl::free(impl->diag->d_grad_in, sycl_data->sycl_context)); 91 CeedCallSycl(ceed, sycl::free(impl->diag->d_grad_out, sycl_data->sycl_context)); 92 CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); 93 94 CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); 95 CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); 96 } 97 CeedCallBackend(CeedFree(&impl->diag)); 98 99 if (impl->asmb) { 100 CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw()); 101 CeedCallSycl(ceed, sycl::free(impl->asmb->d_B_in, sycl_data->sycl_context)); 102 CeedCallSycl(ceed, sycl::free(impl->asmb->d_B_out, sycl_data->sycl_context)); 103 } 104 CeedCallBackend(CeedFree(&impl->asmb)); 105 106 CeedCallBackend(CeedFree(&impl)); 107 return CEED_ERROR_SUCCESS; 108 } 109 110 //------------------------------------------------------------------------------ 111 // Setup infields or outfields 112 //------------------------------------------------------------------------------ 113 static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, 114 CeedInt num_fields, CeedInt Q, CeedInt num_elem) { 115 Ceed ceed; 116 CeedSize q_size; 117 bool is_strided, skip_restriction; 118 CeedInt dim, size; 119 CeedOperatorField *op_fields; 120 CeedQFunctionField *qf_fields; 121 122 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 123 if (is_input) { 124 CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); 125 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 126 } else { 127 CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); 128 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 129 } 130 131 // Loop over fields 132 for (CeedInt i = 0; i < num_fields; i++) { 133 CeedEvalMode eval_mode; 134 CeedVector vec; 135 CeedElemRestriction elem_rstr; 136 CeedBasis basis; 137 138 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 139 140 is_strided = false; 141 skip_restriction = false; 142 if (eval_mode != CEED_EVAL_WEIGHT) { 143 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); 144 145 // Check whether this field can skip the element restriction: 146 // must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. 147 148 // First, check whether the field is input or output: 149 if (is_input) { 150 // Check for passive input: 151 CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); 152 if (vec != CEED_VECTOR_ACTIVE) { 153 // Check eval_mode 154 if (eval_mode == CEED_EVAL_NONE) { 155 // Check for is_strided restriction 156 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 157 if (is_strided) { 158 // Check if vector is already in preferred backend ordering 159 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction)); 160 } 161 } 162 } 163 CeedCallBackend(CeedVectorDestroy(&vec)); 164 } 165 if (skip_restriction) { 166 // We do not need an E-Vector, but will use the input field vector's data directly in the operator application 167 e_vecs[i + start_e] = NULL; 168 } else { 169 CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e])); 170 } 171 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 172 } 173 174 switch (eval_mode) { 175 case CEED_EVAL_NONE: 176 CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); 177 q_size = (CeedSize)num_elem * Q * size; 178 CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); 179 break; 180 case CEED_EVAL_INTERP: 181 CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); 182 q_size = (CeedSize)num_elem * Q * size; 183 CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); 184 break; 185 case CEED_EVAL_GRAD: 186 CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); 187 CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); 188 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 189 CeedCallBackend(CeedBasisDestroy(&basis)); 190 q_size = (CeedSize)num_elem * Q * size; 191 CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); 192 break; 193 case CEED_EVAL_WEIGHT: // Only on input fields 194 CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); 195 q_size = (CeedSize)num_elem * Q; 196 CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); 197 CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); 198 CeedCallBackend(CeedBasisDestroy(&basis)); 199 break; 200 case CEED_EVAL_DIV: 201 break; // TODO: Not implemented 202 case CEED_EVAL_CURL: 203 break; // TODO: Not implemented 204 } 205 } 206 return CEED_ERROR_SUCCESS; 207 } 208 209 //------------------------------------------------------------------------------ 210 // CeedOperator needs to connect all the named fields (be they active or 211 // passive) to the named inputs and outputs of its CeedQFunction. 212 //------------------------------------------------------------------------------ 213 static int CeedOperatorSetup_Sycl(CeedOperator op) { 214 Ceed ceed; 215 bool is_setup_done; 216 CeedInt Q, num_elem, num_input_fields, num_output_fields; 217 CeedQFunctionField *qf_input_fields, *qf_output_fields; 218 CeedQFunction qf; 219 CeedOperatorField *op_input_fields, *op_output_fields; 220 CeedOperator_Sycl *impl; 221 222 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 223 if (is_setup_done) return CEED_ERROR_SUCCESS; 224 225 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 226 CeedCallBackend(CeedOperatorGetData(op, &impl)); 227 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 228 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 229 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 230 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 231 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 232 233 // Allocate 234 CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); 235 236 CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); 237 CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); 238 239 impl->num_e_in = num_input_fields; 240 impl->num_e_out = num_output_fields; 241 242 // Set up infield and outfield e_vecs and q_vecs 243 // Infields 244 CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); 245 // Outfields 246 CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); 247 248 CeedCallBackend(CeedOperatorSetSetupDone(op)); 249 return CEED_ERROR_SUCCESS; 250 } 251 252 //------------------------------------------------------------------------------ 253 // Setup Operator Inputs 254 //------------------------------------------------------------------------------ 255 static inline int CeedOperatorSetupInputs_Sycl(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, 256 CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], 257 CeedOperator_Sycl *impl, CeedRequest *request) { 258 for (CeedInt i = 0; i < num_input_fields; i++) { 259 bool is_active; 260 CeedEvalMode eval_mode; 261 CeedVector vec; 262 263 // Get input vector 264 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 265 is_active = vec == CEED_VECTOR_ACTIVE; 266 if (is_active) { 267 if (skip_active) continue; 268 else vec = in_vec; 269 } 270 271 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 272 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 273 } else { 274 // Restrict, if necessary 275 if (!impl->e_vecs[i]) { 276 // No restriction for this field; read data directly from vec. 277 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); 278 } else { 279 CeedElemRestriction elem_rstr; 280 281 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 282 CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); 283 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 284 CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); 285 } 286 } 287 if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); 288 } 289 return CEED_ERROR_SUCCESS; 290 } 291 292 //------------------------------------------------------------------------------ 293 // Input Basis Action 294 //------------------------------------------------------------------------------ 295 static inline int CeedOperatorInputBasis_Sycl(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, 296 CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], 297 CeedOperator_Sycl *impl) { 298 for (CeedInt i = 0; i < num_input_fields; i++) { 299 CeedEvalMode eval_mode; 300 301 // Skip active input 302 if (skip_active) { 303 bool is_active; 304 CeedVector vec; 305 306 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 307 is_active = vec == CEED_VECTOR_ACTIVE; 308 CeedCallBackend(CeedVectorDestroy(&vec)); 309 if (is_active) continue; 310 } 311 // Basis action 312 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 313 switch (eval_mode) { 314 case CEED_EVAL_NONE: 315 CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); 316 break; 317 case CEED_EVAL_INTERP: 318 case CEED_EVAL_GRAD: { 319 CeedBasis basis; 320 321 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 322 CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i])); 323 CeedCallBackend(CeedBasisDestroy(&basis)); 324 break; 325 } 326 case CEED_EVAL_WEIGHT: 327 break; // No action 328 case CEED_EVAL_DIV: 329 break; // TODO: Not implemented 330 case CEED_EVAL_CURL: 331 break; // TODO: Not implemented 332 } 333 } 334 return CEED_ERROR_SUCCESS; 335 } 336 337 //------------------------------------------------------------------------------ 338 // Restore Input Vectors 339 //------------------------------------------------------------------------------ 340 static inline int CeedOperatorRestoreInputs_Sycl(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, 341 const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl) { 342 for (CeedInt i = 0; i < num_input_fields; i++) { 343 bool is_active; 344 CeedEvalMode eval_mode; 345 CeedVector vec; 346 347 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 348 is_active = vec == CEED_VECTOR_ACTIVE; 349 // Skip active input 350 if (skip_active) { 351 if (is_active) continue; 352 } 353 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 354 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 355 } else { 356 if (!impl->e_vecs[i]) { // This was a skip_restriction case 357 CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i])); 358 } else { 359 CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i])); 360 } 361 } 362 if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); 363 } 364 return CEED_ERROR_SUCCESS; 365 } 366 367 //------------------------------------------------------------------------------ 368 // Apply and add to output 369 //------------------------------------------------------------------------------ 370 static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { 371 CeedInt Q, num_elem, elem_size, num_input_fields, num_output_fields, size; 372 CeedEvalMode eval_mode; 373 CeedScalar *e_data[2 * CEED_FIELD_MAX] = {0}; 374 CeedQFunctionField *qf_input_fields, *qf_output_fields; 375 CeedQFunction qf; 376 CeedOperatorField *op_input_fields, *op_output_fields; 377 CeedOperator_Sycl *impl; 378 379 CeedCallBackend(CeedOperatorGetData(op, &impl)); 380 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 381 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 382 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 383 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 384 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 385 386 // Setup 387 CeedCallBackend(CeedOperatorSetup_Sycl(op)); 388 389 // Input Evecs and Restriction 390 CeedCallBackend(CeedOperatorSetupInputs_Sycl(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); 391 392 // Input basis apply if needed 393 CeedCallBackend(CeedOperatorInputBasis_Sycl(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl)); 394 395 // Output pointers, as necessary 396 for (CeedInt i = 0; i < num_output_fields; i++) { 397 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 398 if (eval_mode == CEED_EVAL_NONE) { 399 // Set the output Q-Vector to use the E-Vector data directly 400 CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_e_in], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); 401 CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); 402 } 403 } 404 405 // Q function 406 CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); 407 408 // Output basis apply if needed 409 for (CeedInt i = 0; i < num_output_fields; i++) { 410 CeedElemRestriction elem_rstr; 411 412 // Get elem_size, eval_mode, size 413 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 414 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 415 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 416 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 417 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); 418 // Basis action 419 switch (eval_mode) { 420 case CEED_EVAL_NONE: 421 break; 422 case CEED_EVAL_INTERP: 423 case CEED_EVAL_GRAD: { 424 CeedBasis basis; 425 426 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 427 CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in])); 428 CeedCallBackend(CeedBasisDestroy(&basis)); 429 break; 430 } 431 // LCOV_EXCL_START 432 case CEED_EVAL_WEIGHT: { 433 Ceed ceed; 434 435 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 436 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 437 break; // Should not occur 438 } 439 case CEED_EVAL_DIV: 440 case CEED_EVAL_CURL: { 441 Ceed ceed; 442 443 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 444 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 445 break; // Should not occur 446 } 447 // LCOV_EXCL_STOP 448 } 449 } 450 451 // Output restriction 452 for (CeedInt i = 0; i < num_output_fields; i++) { 453 bool is_active; 454 CeedEvalMode eval_mode; 455 CeedVector vec; 456 CeedElemRestriction elem_rstr; 457 458 // Restore evec 459 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 460 if (eval_mode == CEED_EVAL_NONE) { 461 CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_e_in], &e_data[i + num_input_fields])); 462 } 463 // Restrict 464 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 465 is_active = vec == CEED_VECTOR_ACTIVE; 466 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 467 if (is_active) vec = out_vec; 468 CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_e_in], vec, request)); 469 if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec)); 470 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 471 } 472 473 // Restore input arrays 474 CeedCallBackend(CeedOperatorRestoreInputs_Sycl(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); 475 return CEED_ERROR_SUCCESS; 476 } 477 478 //------------------------------------------------------------------------------ 479 // Core code for assembling linear QFunction 480 //------------------------------------------------------------------------------ 481 static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, bool build_objects, CeedVector *assembled, 482 CeedElemRestriction *elem_rstr, CeedRequest *request) { 483 Ceed ceed, ceed_parent; 484 CeedSize q_size; 485 CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; 486 CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; 487 CeedVector *active_in; 488 CeedQFunctionField *qf_input_fields, *qf_output_fields; 489 CeedQFunction qf; 490 CeedOperatorField *op_input_fields, *op_output_fields; 491 CeedOperator_Sycl *impl; 492 493 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 494 CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent)); 495 CeedCallBackend(CeedOperatorGetData(op, &impl)); 496 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 497 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 498 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 499 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 500 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 501 active_in = impl->qf_active_in; 502 num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; 503 504 // Setup 505 CeedCallBackend(CeedOperatorSetup_Sycl(op)); 506 507 // Input Evecs and Restriction 508 CeedCallBackend(CeedOperatorSetupInputs_Sycl(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); 509 510 // Count number of active input fields 511 if (!num_active_in) { 512 for (CeedInt i = 0; i < num_input_fields; i++) { 513 CeedScalar *q_vec_array; 514 CeedVector vec; 515 516 // Check if active input 517 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 518 if (vec == CEED_VECTOR_ACTIVE) { 519 CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); 520 CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); 521 CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); 522 CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); 523 for (CeedInt field = 0; field < size; field++) { 524 q_size = (CeedSize)Q * num_elem; 525 CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); 526 CeedCallBackend( 527 CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); 528 } 529 num_active_in += size; 530 CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); 531 } 532 CeedCallBackend(CeedVectorDestroy(&vec)); 533 } 534 impl->num_active_in = num_active_in; 535 impl->qf_active_in = active_in; 536 } 537 538 // Count number of active output fields 539 if (!num_active_out) { 540 for (CeedInt i = 0; i < num_output_fields; i++) { 541 CeedVector vec; 542 543 // Check if active output 544 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 545 if (vec == CEED_VECTOR_ACTIVE) { 546 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); 547 num_active_out += size; 548 } 549 CeedCallBackend(CeedVectorDestroy(&vec)); 550 } 551 impl->num_active_out = num_active_out; 552 } 553 554 // Check sizes 555 CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); 556 557 // Build objects if needed 558 if (build_objects) { 559 CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; 560 CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ 561 562 // Create output restriction 563 CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, l_size, strides, elem_rstr)); 564 // Create assembled vector 565 CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); 566 } 567 CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); 568 CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array)); 569 570 // Input basis apply 571 CeedCallBackend(CeedOperatorInputBasis_Sycl(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl)); 572 573 // Assemble QFunction 574 for (CeedInt in = 0; in < num_active_in; in++) { 575 // Set Inputs 576 CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0)); 577 if (num_active_in > 1) { 578 CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); 579 } 580 // Set Outputs 581 for (CeedInt out = 0; out < num_output_fields; out++) { 582 CeedVector vec; 583 584 // Check if active output 585 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); 586 if (vec == CEED_VECTOR_ACTIVE) { 587 CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array)); 588 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); 589 assembled_array += size * Q * num_elem; // Advance the pointer by the size of the output 590 } 591 CeedCallBackend(CeedVectorDestroy(&vec)); 592 } 593 // Apply QFunction 594 CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); 595 } 596 597 // Un-set output Qvecs to prevent accidental overwrite of Assembled 598 for (CeedInt out = 0; out < num_output_fields; out++) { 599 CeedVector vec; 600 601 // Check if active output 602 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); 603 if (vec == CEED_VECTOR_ACTIVE) { 604 CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL)); 605 } 606 CeedCallBackend(CeedVectorDestroy(&vec)); 607 } 608 609 // Restore input arrays 610 CeedCallBackend(CeedOperatorRestoreInputs_Sycl(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl)); 611 612 // Restore output 613 CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); 614 return CEED_ERROR_SUCCESS; 615 } 616 617 //------------------------------------------------------------------------------ 618 // Assemble Linear QFunction 619 //------------------------------------------------------------------------------ 620 static int CeedOperatorLinearAssembleQFunction_Sycl(CeedOperator op, CeedVector *assembled, CeedElemRestriction *elem_rstr, CeedRequest *request) { 621 return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, true, assembled, elem_rstr, request); 622 } 623 624 //------------------------------------------------------------------------------ 625 // Update Assembled Linear QFunction 626 //------------------------------------------------------------------------------ 627 static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedVector assembled, CeedElemRestriction elem_rstr, 628 CeedRequest *request) { 629 return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, false, &assembled, &elem_rstr, request); 630 } 631 632 //------------------------------------------------------------------------------ 633 // Assemble diagonal setup 634 //------------------------------------------------------------------------------ 635 static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) { 636 Ceed ceed; 637 Ceed_Sycl *sycl_data; 638 CeedInt num_input_fields, num_output_fields, num_eval_mode_in = 0, num_comp = 0, dim = 1, num_eval_mode_out = 0; 639 CeedEvalMode *eval_mode_in = NULL, *eval_mode_out = NULL; 640 CeedBasis basis_in = NULL, basis_out = NULL; 641 CeedQFunctionField *qf_fields; 642 CeedQFunction qf; 643 CeedOperatorField *op_fields; 644 CeedOperator_Sycl *impl; 645 646 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 647 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 648 CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); 649 650 // Determine active input basis 651 CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); 652 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 653 for (CeedInt i = 0; i < num_input_fields; i++) { 654 CeedVector vec; 655 656 CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); 657 if (vec == CEED_VECTOR_ACTIVE) { 658 CeedEvalMode eval_mode; 659 CeedBasis basis; 660 661 CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); 662 CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, 663 "Backend does not implement operator diagonal assembly with multiple active bases"); 664 if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in)); 665 CeedCallBackend(CeedBasisDestroy(&basis)); 666 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 667 switch (eval_mode) { 668 case CEED_EVAL_NONE: 669 case CEED_EVAL_INTERP: 670 CeedCallBackend(CeedRealloc(num_eval_mode_in + 1, &eval_mode_in)); 671 eval_mode_in[num_eval_mode_in] = eval_mode; 672 num_eval_mode_in += 1; 673 break; 674 case CEED_EVAL_GRAD: 675 CeedCallBackend(CeedRealloc(num_eval_mode_in + dim, &eval_mode_in)); 676 for (CeedInt d = 0; d < dim; d++) eval_mode_in[num_eval_mode_in + d] = eval_mode; 677 num_eval_mode_in += dim; 678 break; 679 case CEED_EVAL_WEIGHT: 680 case CEED_EVAL_DIV: 681 case CEED_EVAL_CURL: 682 break; // Caught by QF Assembly 683 } 684 } 685 CeedCallBackend(CeedVectorDestroy(&vec)); 686 } 687 688 // Determine active output basis 689 CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); 690 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 691 for (CeedInt i = 0; i < num_output_fields; i++) { 692 CeedVector vec; 693 694 CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); 695 if (vec == CEED_VECTOR_ACTIVE) { 696 CeedEvalMode eval_mode; 697 CeedBasis basis; 698 699 CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); 700 CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, 701 "Backend does not implement operator diagonal assembly with multiple active bases"); 702 if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out)); 703 CeedCallBackend(CeedBasisDestroy(&basis)); 704 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 705 switch (eval_mode) { 706 case CEED_EVAL_NONE: 707 case CEED_EVAL_INTERP: 708 CeedCallBackend(CeedRealloc(num_eval_mode_in + 1, &eval_mode_in)); 709 eval_mode_in[num_eval_mode_in] = eval_mode; 710 num_eval_mode_in += 1; 711 break; 712 case CEED_EVAL_GRAD: 713 CeedCallBackend(CeedRealloc(num_eval_mode_in + dim, &eval_mode_in)); 714 for (CeedInt d = 0; d < dim; d++) eval_mode_in[num_eval_mode_in + d] = eval_mode; 715 num_eval_mode_in += dim; 716 break; 717 case CEED_EVAL_WEIGHT: 718 case CEED_EVAL_DIV: 719 case CEED_EVAL_CURL: 720 break; // Caught by QF Assembly 721 } 722 } 723 CeedCallBackend(CeedVectorDestroy(&vec)); 724 } 725 726 // Operator data struct 727 CeedCallBackend(CeedOperatorGetData(op, &impl)); 728 CeedCallBackend(CeedGetData(ceed, &sycl_data)); 729 CeedCallBackend(CeedCalloc(1, &impl->diag)); 730 CeedOperatorDiag_Sycl *diag = impl->diag; 731 732 diag->basis_in = basis_in; 733 diag->basis_out = basis_out; 734 diag->h_eval_mode_in = eval_mode_in; 735 diag->h_eval_mode_out = eval_mode_out; 736 diag->num_eval_mode_in = num_eval_mode_in; 737 diag->num_eval_mode_out = num_eval_mode_out; 738 739 // Kernel parameters 740 CeedInt num_nodes, num_qpts; 741 CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); 742 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); 743 diag->num_nodes = num_nodes; 744 diag->num_qpts = num_qpts; 745 diag->num_comp = num_comp; 746 747 // Basis matrices 748 const CeedInt i_len = num_qpts * num_nodes; 749 const CeedInt g_len = num_qpts * num_nodes * dim; 750 const CeedScalar *interp_in, *interp_out, *grad_in, *grad_out; 751 752 // CEED_EVAL_NONE 753 CeedScalar *identity = NULL; 754 bool has_eval_none = false; 755 for (CeedInt i = 0; i < num_eval_mode_in; i++) has_eval_none = has_eval_none || (eval_mode_in[i] == CEED_EVAL_NONE); 756 for (CeedInt i = 0; i < num_eval_mode_out; i++) has_eval_none = has_eval_none || (eval_mode_out[i] == CEED_EVAL_NONE); 757 758 std::vector<sycl::event> e; 759 760 if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()}; 761 762 std::vector<sycl::event> copy_events; 763 764 if (has_eval_none) { 765 CeedCallBackend(CeedCalloc(num_qpts * num_nodes, &identity)); 766 for (CeedSize i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0; 767 CeedCallSycl(ceed, diag->d_identity = sycl::malloc_device<CeedScalar>(i_len, sycl_data->sycl_device, sycl_data->sycl_context)); 768 sycl::event identity_copy = sycl_data->sycl_queue.copy<CeedScalar>(identity, diag->d_identity, i_len, e); 769 copy_events.push_back(identity_copy); 770 } 771 772 // CEED_EVAL_INTERP 773 CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); 774 CeedCallSycl(ceed, diag->d_interp_in = sycl::malloc_device<CeedScalar>(i_len, sycl_data->sycl_device, sycl_data->sycl_context)); 775 sycl::event interp_in_copy = sycl_data->sycl_queue.copy<CeedScalar>(interp_in, diag->d_interp_in, i_len, e); 776 copy_events.push_back(interp_in_copy); 777 778 CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); 779 CeedCallSycl(ceed, diag->d_interp_out = sycl::malloc_device<CeedScalar>(i_len, sycl_data->sycl_device, sycl_data->sycl_context)); 780 sycl::event interp_out_copy = sycl_data->sycl_queue.copy<CeedScalar>(interp_out, diag->d_interp_out, i_len, e); 781 copy_events.push_back(interp_out_copy); 782 783 // CEED_EVAL_GRAD 784 CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); 785 CeedCallSycl(ceed, diag->d_grad_in = sycl::malloc_device<CeedScalar>(g_len, sycl_data->sycl_device, sycl_data->sycl_context)); 786 sycl::event grad_in_copy = sycl_data->sycl_queue.copy<CeedScalar>(grad_in, diag->d_grad_in, g_len, e); 787 copy_events.push_back(grad_in_copy); 788 789 CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); 790 CeedCallSycl(ceed, diag->d_grad_out = sycl::malloc_device<CeedScalar>(g_len, sycl_data->sycl_device, sycl_data->sycl_context)); 791 sycl::event grad_out_copy = sycl_data->sycl_queue.copy<CeedScalar>(grad_out, diag->d_grad_out, g_len, e); 792 copy_events.push_back(grad_out_copy); 793 794 // Arrays of eval_modes 795 CeedCallSycl(ceed, diag->d_eval_mode_in = sycl::malloc_device<CeedEvalMode>(num_eval_mode_in, sycl_data->sycl_device, sycl_data->sycl_context)); 796 sycl::event eval_mode_in_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(eval_mode_in, diag->d_eval_mode_in, num_eval_mode_in, e); 797 copy_events.push_back(eval_mode_in_copy); 798 799 CeedCallSycl(ceed, diag->d_eval_mode_out = sycl::malloc_device<CeedEvalMode>(num_eval_mode_out, sycl_data->sycl_device, sycl_data->sycl_context)); 800 sycl::event eval_mode_out_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(eval_mode_out, diag->d_eval_mode_out, num_eval_mode_out, e); 801 copy_events.push_back(eval_mode_out_copy); 802 803 // Restriction 804 { 805 CeedElemRestriction rstr_out; 806 807 CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, NULL, &rstr_out)); 808 diag->diag_rstr = rstr_out; 809 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out)); 810 } 811 812 // Wait for all copies to complete and handle exceptions 813 CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events)); 814 return CEED_ERROR_SUCCESS; 815 } 816 817 //------------------------------------------------------------------------------ 818 // Kernel for diagonal assembly 819 //------------------------------------------------------------------------------ 820 static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool is_point_block, const CeedInt num_elem, 821 const CeedOperatorDiag_Sycl *diag, const CeedScalar *assembled_qf_array, CeedScalar *elem_diag_array) { 822 const CeedSize num_nodes = diag->num_nodes; 823 const CeedSize num_qpts = diag->num_qpts; 824 const CeedSize num_comp = diag->num_comp; 825 const CeedSize num_eval_mode_in = diag->num_eval_mode_in; 826 const CeedSize num_eval_mode_out = diag->num_eval_mode_out; 827 const CeedScalar *identity = diag->d_identity; 828 const CeedScalar *interp_in = diag->d_interp_in; 829 const CeedScalar *grad_in = diag->d_grad_in; 830 const CeedScalar *interp_out = diag->d_interp_out; 831 const CeedScalar *grad_out = diag->d_grad_out; 832 const CeedEvalMode *eval_mode_in = diag->d_eval_mode_in; 833 const CeedEvalMode *eval_mode_out = diag->d_eval_mode_out; 834 835 sycl::range<1> kernel_range(num_elem * num_nodes); 836 837 std::vector<sycl::event> e; 838 839 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()}; 840 841 sycl_queue.parallel_for<CeedOperatorSyclLinearDiagonal>(kernel_range, e, [=](sycl::id<1> idx) { 842 const CeedInt tid = idx % num_nodes; 843 const CeedInt e = idx / num_nodes; 844 845 // Compute the diagonal of B^T D B 846 // Each element 847 CeedInt d_out = -1; 848 // Each basis eval mode pair 849 for (CeedSize e_out = 0; e_out < num_eval_mode_out; e_out++) { 850 const CeedScalar *bt = NULL; 851 852 if (eval_mode_out[e_out] == CEED_EVAL_GRAD) ++d_out; 853 CeedOperatorGetBasisPointer_Sycl(&bt, eval_mode_out[e_out], identity, interp_out, &grad_out[d_out * num_qpts * num_nodes]); 854 CeedInt d_in = -1; 855 856 for (CeedSize e_in = 0; e_in < num_eval_mode_in; e_in++) { 857 const CeedScalar *b = NULL; 858 859 if (eval_mode_in[e_in] == CEED_EVAL_GRAD) ++d_in; 860 CeedOperatorGetBasisPointer_Sycl(&b, eval_mode_in[e_in], identity, interp_in, &grad_in[d_in * num_qpts * num_nodes]); 861 // Each component 862 for (CeedSize comp_out = 0; comp_out < num_comp; comp_out++) { 863 // Each qpoint/node pair 864 if (is_point_block) { 865 // Point Block Diagonal 866 for (CeedInt comp_in = 0; comp_in < num_comp; comp_in++) { 867 CeedScalar e_value = 0.0; 868 869 for (CeedSize q = 0; q < num_qpts; q++) { 870 const CeedScalar qf_value = 871 assembled_qf_array[((((e_in * num_comp + comp_in) * num_eval_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts + 872 q]; 873 874 e_value += bt[q * num_nodes + tid] * qf_value * b[q * num_nodes + tid]; 875 } 876 elem_diag_array[((comp_out * num_comp + comp_in) * num_elem + e) * num_nodes + tid] += e_value; 877 } 878 } else { 879 // Diagonal Only 880 CeedScalar e_value = 0.0; 881 882 for (CeedSize q = 0; q < num_qpts; q++) { 883 const CeedScalar qf_value = 884 assembled_qf_array[((((e_in * num_comp + comp_out) * num_eval_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts + 885 q]; 886 e_value += bt[q * num_nodes + tid] * qf_value * b[q * num_nodes + tid]; 887 } 888 elem_diag_array[(comp_out * num_elem + e) * num_nodes + tid] += e_value; 889 } 890 } 891 } 892 } 893 }); 894 return CEED_ERROR_SUCCESS; 895 } 896 897 //------------------------------------------------------------------------------ 898 // Assemble diagonal common code 899 //------------------------------------------------------------------------------ 900 static inline int CeedOperatorAssembleDiagonalCore_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { 901 Ceed ceed; 902 Ceed_Sycl *sycl_data; 903 CeedInt num_elem; 904 CeedScalar *elem_diag_array; 905 const CeedScalar *assembled_qf_array; 906 CeedVector assembled_qf = NULL; 907 CeedOperator_Sycl *impl; 908 909 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 910 CeedCallBackend(CeedOperatorGetData(op, &impl)); 911 CeedCallBackend(CeedGetData(ceed, &sycl_data)); 912 913 // Assemble QFunction 914 { 915 CeedElemRestriction elem_rstr = NULL; 916 917 CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &elem_rstr, request)); 918 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 919 } 920 921 // Setup 922 if (!impl->diag) { 923 CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Sycl(op)); 924 } 925 CeedOperatorDiag_Sycl *diag = impl->diag; 926 927 assert(diag != NULL); 928 929 // Restriction 930 if (is_point_block && !diag->point_block_diag_rstr) { 931 CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(diag->diag_rstr, &diag->point_block_diag_rstr)); 932 } 933 CeedElemRestriction diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; 934 935 // Create diagonal vector 936 CeedVector elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; 937 938 if (!elem_diag) { 939 CeedCallBackend(CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag)); 940 if (is_point_block) diag->point_block_elem_diag = elem_diag; 941 else diag->elem_diag = elem_diag; 942 } 943 CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); 944 945 // Assemble element operator diagonals 946 CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); 947 CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); 948 CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); 949 950 // Compute the diagonal of B^T D B 951 CeedCallBackend(CeedOperatorLinearDiagonal_Sycl(sycl_data->sycl_queue, is_point_block, num_elem, diag, assembled_qf_array, elem_diag_array)); 952 953 // Wait for queue to complete and handle exceptions 954 sycl_data->sycl_queue.wait_and_throw(); 955 956 // Restore arrays 957 CeedCallBackend(CeedVectorRestoreArray(elem_diag, &elem_diag_array)); 958 CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); 959 960 // Assemble local operator diagonal 961 CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request)); 962 963 // Cleanup 964 CeedCallBackend(CeedVectorDestroy(&assembled_qf)); 965 return CEED_ERROR_SUCCESS; 966 } 967 968 //------------------------------------------------------------------------------ 969 // Assemble Linear Diagonal 970 //------------------------------------------------------------------------------ 971 static int CeedOperatorLinearAssembleAddDiagonal_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request) { 972 CeedCallBackend(CeedOperatorAssembleDiagonalCore_Sycl(op, assembled, request, false)); 973 return CEED_ERROR_SUCCESS; 974 } 975 976 //------------------------------------------------------------------------------ 977 // Assemble Linear Point Block Diagonal 978 //------------------------------------------------------------------------------ 979 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request) { 980 CeedCallBackend(CeedOperatorAssembleDiagonalCore_Sycl(op, assembled, request, true)); 981 return CEED_ERROR_SUCCESS; 982 } 983 984 //------------------------------------------------------------------------------ 985 // Single operator assembly setup 986 //------------------------------------------------------------------------------ 987 static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { 988 Ceed ceed; 989 CeedInt num_input_fields, num_output_fields, num_eval_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_eval_mode_out = 0, 990 num_B_out_mats_to_load = 0, size_B_out = 0, num_qpts = 0, elem_size = 0, num_elem, num_comp, 991 mat_start = 0; 992 CeedEvalMode *eval_mode_in = NULL, *eval_mode_out = NULL; 993 const CeedScalar *interp_in, *grad_in; 994 CeedElemRestriction rstr_in = NULL, rstr_out = NULL; 995 CeedBasis basis_in = NULL, basis_out = NULL; 996 CeedQFunctionField *qf_fields; 997 CeedQFunction qf; 998 CeedOperatorField *input_fields, *output_fields; 999 CeedOperator_Sycl *impl; 1000 1001 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1002 CeedCallBackend(CeedOperatorGetData(op, &impl)); 1003 1004 // Get input and output fields 1005 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); 1006 1007 // Determine active input basis eval mode 1008 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1009 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 1010 // Note that the kernel will treat each dimension of a gradient action separately; 1011 // i.e., when an active input has a CEED_EVAL_GRAD mode, num_ eval_mode_in will increment by dim. 1012 // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, 1013 // so num_B_in_mats_to_load will be incremented by 1. 1014 for (CeedInt i = 0; i < num_input_fields; i++) { 1015 CeedVector vec; 1016 1017 CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); 1018 if (vec == CEED_VECTOR_ACTIVE) { 1019 CeedEvalMode eval_mode; 1020 CeedElemRestriction elem_rstr; 1021 CeedBasis basis; 1022 1023 CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis)); 1024 CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); 1025 if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in)); 1026 CeedCallBackend(CeedBasisDestroy(&basis)); 1027 CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); 1028 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); 1029 CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr)); 1030 if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in)); 1031 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1032 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); 1033 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 1034 if (eval_mode != CEED_EVAL_NONE) { 1035 CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); 1036 eval_mode_in[num_B_in_mats_to_load] = eval_mode; 1037 num_B_in_mats_to_load += 1; 1038 if (eval_mode == CEED_EVAL_GRAD) { 1039 num_eval_mode_in += dim; 1040 size_B_in += dim * elem_size * num_qpts; 1041 } else { 1042 num_eval_mode_in += 1; 1043 size_B_in += elem_size * num_qpts; 1044 } 1045 } 1046 } 1047 CeedCallBackend(CeedVectorDestroy(&vec)); 1048 } 1049 1050 // Determine active output basis; basis_out and rstr_out only used if same as input, TODO 1051 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 1052 for (CeedInt i = 0; i < num_output_fields; i++) { 1053 CeedVector vec; 1054 1055 CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); 1056 if (vec == CEED_VECTOR_ACTIVE) { 1057 CeedEvalMode eval_mode; 1058 CeedElemRestriction elem_rstr; 1059 CeedBasis basis; 1060 1061 CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis)); 1062 CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, 1063 "Backend does not implement operator assembly with multiple active bases"); 1064 if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out)); 1065 CeedCallBackend(CeedBasisDestroy(&basis)); 1066 CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &elem_rstr)); 1067 if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out)); 1068 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1069 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 1070 if (eval_mode != CEED_EVAL_NONE) { 1071 CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); 1072 eval_mode_out[num_B_out_mats_to_load] = eval_mode; 1073 num_B_out_mats_to_load += 1; 1074 if (eval_mode == CEED_EVAL_GRAD) { 1075 num_eval_mode_out += dim; 1076 size_B_out += dim * elem_size * num_qpts; 1077 } else { 1078 num_eval_mode_out += 1; 1079 size_B_out += elem_size * num_qpts; 1080 } 1081 } 1082 } 1083 CeedCallBackend(CeedVectorDestroy(&vec)); 1084 } 1085 CeedCheck(num_eval_mode_in > 0 && num_eval_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); 1086 1087 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem)); 1088 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp)); 1089 1090 CeedCallBackend(CeedCalloc(1, &impl->asmb)); 1091 CeedOperatorAssemble_Sycl *asmb = impl->asmb; 1092 asmb->num_elem = num_elem; 1093 1094 Ceed_Sycl *sycl_data; 1095 CeedCallBackend(CeedGetData(ceed, &sycl_data)); 1096 1097 // Kernel setup 1098 int elems_per_block = 1; 1099 asmb->elems_per_block = elems_per_block; 1100 asmb->block_size_x = elem_size; 1101 asmb->block_size_y = elem_size; 1102 asmb->num_eval_mode_in = num_eval_mode_in; 1103 asmb->num_eval_mode_out = num_eval_mode_out; 1104 asmb->num_qpts = num_qpts; 1105 asmb->num_nodes = elem_size; 1106 asmb->block_size = elem_size * elem_size * elems_per_block; 1107 asmb->num_comp = num_comp; 1108 1109 // Build 'full' B matrices (not 1D arrays used for tensor-product matrices 1110 CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); 1111 CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); 1112 1113 // Load into B_in, in order that they will be used in eval_mode 1114 CeedCallSycl(ceed, asmb->d_B_in = sycl::malloc_device<CeedScalar>(size_B_in, sycl_data->sycl_device, sycl_data->sycl_context)); 1115 for (int i = 0; i < num_B_in_mats_to_load; i++) { 1116 CeedEvalMode eval_mode = eval_mode_in[i]; 1117 1118 if (eval_mode == CEED_EVAL_INTERP) { 1119 std::vector<sycl::event> e; 1120 1121 if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()}; 1122 sycl_data->sycl_queue.copy<CeedScalar>(interp_in, &asmb->d_B_in[mat_start], elem_size * num_qpts, e); 1123 mat_start += elem_size * num_qpts; 1124 } else if (eval_mode == CEED_EVAL_GRAD) { 1125 std::vector<sycl::event> e; 1126 1127 if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()}; 1128 sycl_data->sycl_queue.copy<CeedScalar>(grad_in, &asmb->d_B_in[mat_start], dim * elem_size * num_qpts, e); 1129 mat_start += dim * elem_size * num_qpts; 1130 } 1131 } 1132 1133 const CeedScalar *interp_out, *grad_out; 1134 // Note that this function currently assumes 1 basis, so this should always be true 1135 // for now 1136 if (basis_out == basis_in) { 1137 interp_out = interp_in; 1138 grad_out = grad_in; 1139 } else { 1140 CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); 1141 CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); 1142 } 1143 1144 // Load into B_out, in order that they will be used in eval_mode 1145 mat_start = 0; 1146 CeedCallSycl(ceed, asmb->d_B_out = sycl::malloc_device<CeedScalar>(size_B_out, sycl_data->sycl_device, sycl_data->sycl_context)); 1147 for (int i = 0; i < num_B_out_mats_to_load; i++) { 1148 CeedEvalMode eval_mode = eval_mode_out[i]; 1149 1150 if (eval_mode == CEED_EVAL_INTERP) { 1151 std::vector<sycl::event> e; 1152 1153 if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()}; 1154 sycl_data->sycl_queue.copy<CeedScalar>(interp_out, &asmb->d_B_out[mat_start], elem_size * num_qpts, e); 1155 mat_start += elem_size * num_qpts; 1156 } else if (eval_mode == CEED_EVAL_GRAD) { 1157 std::vector<sycl::event> e; 1158 1159 if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()}; 1160 sycl_data->sycl_queue.copy<CeedScalar>(grad_out, &asmb->d_B_out[mat_start], dim * elem_size * num_qpts, e); 1161 mat_start += dim * elem_size * num_qpts; 1162 } 1163 } 1164 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in)); 1165 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out)); 1166 CeedCallBackend(CeedBasisDestroy(&basis_in)); 1167 CeedCallBackend(CeedBasisDestroy(&basis_out)); 1168 return CEED_ERROR_SUCCESS; 1169 } 1170 1171 //------------------------------------------------------------------------------ 1172 // Matrix assembly kernel for low-order elements (3D thread block) 1173 //------------------------------------------------------------------------------ 1174 static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOperator_Sycl *impl, const CeedScalar *qf_array, 1175 CeedScalar *values_array) { 1176 // This kernels assumes B_in and B_out have the same number of quadrature points and basis points. 1177 // TODO: expand to more general cases 1178 CeedOperatorAssemble_Sycl *asmb = impl->asmb; 1179 const CeedInt num_elem = asmb->num_elem; 1180 const CeedSize num_nodes = asmb->num_nodes; 1181 const CeedSize num_comp = asmb->num_comp; 1182 const CeedSize num_qpts = asmb->num_qpts; 1183 const CeedSize num_eval_mode_in = asmb->num_eval_mode_in; 1184 const CeedSize num_eval_mode_out = asmb->num_eval_mode_out; 1185 1186 // Strides for final output ordering, determined by the reference (inference) implementation of the symbolic assembly, slowest --> fastest: element, 1187 // comp_in, comp_out, node_row, node_col 1188 const CeedSize comp_out_stride = num_nodes * num_nodes; 1189 const CeedSize comp_in_stride = comp_out_stride * num_comp; 1190 const CeedSize e_stride = comp_in_stride * num_comp; 1191 // Strides for QF array, slowest --> fastest: eval_mode_in, comp_in, eval_mode_out, comp_out, elem, qpt 1192 const CeedSize q_e_stride = num_qpts; 1193 const CeedSize q_comp_out_stride = num_elem * q_e_stride; 1194 const CeedSize q_eval_mode_out_stride = q_comp_out_stride * num_comp; 1195 const CeedSize q_comp_in_stride = q_eval_mode_out_stride * num_eval_mode_out; 1196 const CeedSize q_eval_mode_in_stride = q_comp_in_stride * num_comp; 1197 1198 CeedScalar *B_in, *B_out; 1199 B_in = asmb->d_B_in; 1200 B_out = asmb->d_B_out; 1201 const CeedInt block_size_x = asmb->block_size_x; 1202 const CeedInt block_size_y = asmb->block_size_y; 1203 1204 sycl::range<3> kernel_range(num_elem, block_size_y, block_size_x); 1205 1206 std::vector<sycl::event> e; 1207 1208 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()}; 1209 sycl_queue.parallel_for<CeedOperatorSyclLinearAssemble>(kernel_range, e, [=](sycl::id<3> idx) { 1210 const int e = idx.get(0); // Element index 1211 const int l = idx.get(1); // The output column index of each B^TDB operation 1212 const int i = idx.get(2); // The output row index of each B^TDB operation 1213 // such that we have (Bout^T)_ij D_jk Bin_kl = C_il 1214 for (CeedSize comp_in = 0; comp_in < num_comp; comp_in++) { 1215 for (CeedSize comp_out = 0; comp_out < num_comp; comp_out++) { 1216 CeedScalar result = 0.0; 1217 CeedSize qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; 1218 1219 for (CeedSize eval_mode_in = 0; eval_mode_in < num_eval_mode_in; eval_mode_in++) { 1220 CeedSize b_in_index = eval_mode_in * num_qpts * num_nodes; 1221 1222 for (CeedSize eval_mode_out = 0; eval_mode_out < num_eval_mode_out; eval_mode_out++) { 1223 CeedSize b_out_index = eval_mode_out * num_qpts * num_nodes; 1224 CeedSize qf_index = qf_index_comp + q_eval_mode_out_stride * eval_mode_out + q_eval_mode_in_stride * eval_mode_in; 1225 1226 // Perform the B^T D B operation for this 'chunk' of D (the qf_array) 1227 for (CeedSize j = 0; j < num_qpts; j++) { 1228 result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l]; 1229 } 1230 } // end of eval_mode_out 1231 } // end of eval_mode_in 1232 CeedSize val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l; 1233 1234 values_array[val_index] = result; 1235 } // end of out component 1236 } // end of in component 1237 }); 1238 return CEED_ERROR_SUCCESS; 1239 } 1240 1241 //------------------------------------------------------------------------------ 1242 // Fallback kernel for larger orders (1D thread block) 1243 //------------------------------------------------------------------------------ 1244 /* 1245 static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, const CeedOperator_Sycl *impl, const CeedScalar *qf_array, 1246 CeedScalar *values_array) { 1247 // This kernel assumes B_in and B_out have the same number of quadrature points and basis points. 1248 // TODO: expand to more general cases 1249 CeedOperatorAssemble_Sycl *asmb = impl->asmb; 1250 const CeedInt num_elem = asmb->num_elem; 1251 const CeedInt num_nodes = asmb->num_nodes; 1252 const CeedInt num_comp = asmb->num_comp; 1253 const CeedInt num_qpts = asmb->num_qpts; 1254 const CeedInt num_eval_mode_in = asmb->num_eval_mode_in; 1255 const CeedInt num_eval_mode_out = asmb->num_eval_mode_out; 1256 1257 // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: elememt, 1258 // comp_in, comp_out, node_row, node_col 1259 const CeedInt comp_out_stride = num_nodes * num_nodes; 1260 const CeedInt comp_in_stride = comp_out_stride * num_comp; 1261 const CeedInt e_stride = comp_in_stride * num_comp; 1262 // Strides for QF array, slowest --> fastest: eval_mode_in, comp_in, eval_mode_out, comp_out, elem, qpt 1263 const CeedInt q_e_stride = num_qpts; 1264 const CeedInt q_comp_out_stride = num_elem * q_e_stride; 1265 const CeedInt q_eval_mode_out_stride = q_comp_out_stride * num_comp; 1266 const CeedInt q_comp_in_stride = q_eval_mode_out_stride * num_eval_mode_out; 1267 const CeedInt q_eval_mode_in_stride = q_comp_in_stride * num_comp; 1268 1269 CeedScalar *B_in, *B_out; 1270 B_in = asmb->d_B_in; 1271 B_out = asmb->d_B_out; 1272 const CeedInt elems_per_block = asmb->elems_per_block; 1273 const CeedInt block_size_x = asmb->block_size_x; 1274 const CeedInt block_size_y = asmb->block_size_y; // This will be 1 for the fallback kernel 1275 1276 const CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 1277 sycl::range<3> local_range(block_size_x, block_size_y, elems_per_block); 1278 sycl::range<3> global_range(grid * block_size_x, block_size_y, elems_per_block); 1279 sycl::nd_range<3> kernel_range(global_range, local_range); 1280 1281 sycl_queue.parallel_for<CeedOperatorSyclLinearAssembleFallback>(kernel_range, [=](sycl::nd_item<3> work_item) { 1282 const CeedInt blockIdx = work_item.get_group(0); 1283 const CeedInt gridDimx = work_item.get_group_range(0); 1284 const CeedInt threadIdx = work_item.get_local_id(0); 1285 const CeedInt threadIdz = work_item.get_local_id(2); 1286 const CeedInt blockDimz = work_item.get_local_range(2); 1287 1288 const int l = threadIdx; // The output column index of each B^TDB operation 1289 // such that we have (Bout^T)_ij D_jk Bin_kl = C_il 1290 for (CeedInt e = blockIdx * blockDimz + threadIdz; e < num_elem; e += gridDimx * blockDimz) { 1291 for (CeedInt comp_in = 0; comp_in < num_comp; comp_in++) { 1292 for (CeedInt comp_out = 0; comp_out < num_comp; comp_out++) { 1293 for (CeedInt i = 0; i < num_nodes; i++) { 1294 CeedScalar result = 0.0; 1295 CeedInt qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; 1296 for (CeedInt eval_mode_in = 0; eval_mode_in < num_eval_mode_in; eval_mode_in++) { 1297 CeedInt b_in_index = eval_mode_in * num_qpts * num_nodes; 1298 for (CeedInt eval_mode_out = 0; eval_mode_out < num_eval_mode_out; eval_mode_out++) { 1299 CeedInt b_out_index = eval_mode_out * num_qpts * num_nodes; 1300 CeedInt qf_index = qf_index_comp + q_eval_mode_out_stride * eval_mode_out + q_eval_mode_in_stride * eval_mode_in; 1301 // Perform the B^T D B operation for this 'chunk' of D (the qf_array) 1302 for (CeedInt j = 0; j < num_qpts; j++) { 1303 result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l]; 1304 } 1305 } // end of eval_mode_out 1306 } // end of eval_mode_in 1307 CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l; 1308 values_array[val_index] = result; 1309 } // end of loop over element node index, i 1310 } // end of out component 1311 } // end of in component 1312 } // end of element loop 1313 }); 1314 return CEED_ERROR_SUCCESS; 1315 }*/ 1316 1317 //------------------------------------------------------------------------------ 1318 // Assemble matrix data for COO matrix of assembled operator. 1319 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic. 1320 // 1321 // Note that this (and other assembly routines) currently assume only one active 1322 // input restriction/basis per operator (could have multiple basis eval modes). 1323 // TODO: allow multiple active input restrictions/basis objects 1324 //------------------------------------------------------------------------------ 1325 static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, CeedVector values) { 1326 Ceed ceed; 1327 Ceed_Sycl *sycl_data; 1328 CeedScalar *values_array; 1329 const CeedScalar *qf_array; 1330 CeedVector assembled_qf = NULL; 1331 CeedElemRestriction rstr_q = NULL; 1332 CeedOperator_Sycl *impl; 1333 1334 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1335 CeedCallBackend(CeedOperatorGetData(op, &impl)); 1336 CeedCallBackend(CeedGetData(ceed, &sycl_data)); 1337 1338 // Setup 1339 if (!impl->asmb) { 1340 CeedCallBackend(CeedSingleOperatorAssembleSetup_Sycl(op)); 1341 assert(impl->asmb != NULL); 1342 } 1343 1344 // Assemble QFunction 1345 CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); 1346 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); 1347 CeedCallBackend(CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array)); 1348 values_array += offset; 1349 CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); 1350 1351 // Compute B^T D B 1352 CeedCallBackend(CeedOperatorLinearAssemble_Sycl(sycl_data->sycl_queue, impl, qf_array, values_array)); 1353 1354 // Wait for kernels to be completed 1355 // Kris: Review if this is necessary -- enqueing an async barrier may be sufficient 1356 sycl_data->sycl_queue.wait_and_throw(); 1357 1358 // Restore arrays 1359 CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); 1360 CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array)); 1361 1362 // Cleanup 1363 CeedCallBackend(CeedVectorDestroy(&assembled_qf)); 1364 return CEED_ERROR_SUCCESS; 1365 } 1366 1367 //------------------------------------------------------------------------------ 1368 // Create operator 1369 //------------------------------------------------------------------------------ 1370 int CeedOperatorCreate_Sycl(CeedOperator op) { 1371 Ceed ceed; 1372 CeedOperator_Sycl *impl; 1373 1374 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1375 1376 CeedCallBackend(CeedCalloc(1, &impl)); 1377 CeedCallBackend(CeedOperatorSetData(op, impl)); 1378 1379 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Sycl)); 1380 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Sycl)); 1381 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Sycl)); 1382 CeedCallBackend( 1383 CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl)); 1384 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Sycl)); 1385 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl)); 1386 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl)); 1387 return CEED_ERROR_SUCCESS; 1388 } 1389 1390 //------------------------------------------------------------------------------ 1391