1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <assert.h> 12 #include <cuda.h> 13 #include <cuda_runtime.h> 14 #include <stdbool.h> 15 #include <string.h> 16 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #include "ceed-cuda-ref.h" 20 21 //------------------------------------------------------------------------------ 22 // Destroy operator 23 //------------------------------------------------------------------------------ 24 static int CeedOperatorDestroy_Cuda(CeedOperator op) { 25 CeedOperator_Cuda *impl; 26 27 CeedCallBackend(CeedOperatorGetData(op, &impl)); 28 29 // Apply data 30 for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { 31 CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i])); 32 } 33 CeedCallBackend(CeedFree(&impl->e_vecs)); 34 CeedCallBackend(CeedFree(&impl->input_states)); 35 36 for (CeedInt i = 0; i < impl->num_inputs; i++) { 37 CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i])); 38 } 39 CeedCallBackend(CeedFree(&impl->q_vecs_in)); 40 41 for (CeedInt i = 0; i < impl->num_outputs; i++) { 42 CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); 43 } 44 CeedCallBackend(CeedFree(&impl->q_vecs_out)); 45 46 // QFunction assembly data 47 for (CeedInt i = 0; i < impl->num_active_in; i++) { 48 CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); 49 } 50 CeedCallBackend(CeedFree(&impl->qf_active_in)); 51 52 // Diag data 53 if (impl->diag) { 54 Ceed ceed; 55 56 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 57 if (impl->diag->module) { 58 CeedCallCuda(ceed, cuModuleUnload(impl->diag->module)); 59 } 60 if (impl->diag->module_point_block) { 61 CeedCallCuda(ceed, cuModuleUnload(impl->diag->module_point_block)); 62 } 63 CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_in)); 64 CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_out)); 65 CeedCallCuda(ceed, cudaFree(impl->diag->d_identity)); 66 CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_in)); 67 CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_out)); 68 CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_in)); 69 CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_out)); 70 CeedCallCuda(ceed, cudaFree(impl->diag->d_div_in)); 71 CeedCallCuda(ceed, cudaFree(impl->diag->d_div_out)); 72 CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_in)); 73 CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_out)); 74 CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr)); 75 CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); 76 CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); 77 CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); 78 } 79 CeedCallBackend(CeedFree(&impl->diag)); 80 81 if (impl->asmb) { 82 Ceed ceed; 83 84 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 85 CeedCallCuda(ceed, cuModuleUnload(impl->asmb->module)); 86 CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_in)); 87 CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_out)); 88 } 89 CeedCallBackend(CeedFree(&impl->asmb)); 90 91 CeedCallBackend(CeedFree(&impl)); 92 return CEED_ERROR_SUCCESS; 93 } 94 95 //------------------------------------------------------------------------------ 96 // Setup infields or outfields 97 //------------------------------------------------------------------------------ 98 static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, 99 CeedInt num_fields, CeedInt Q, CeedInt num_elem) { 100 Ceed ceed; 101 CeedQFunctionField *qf_fields; 102 CeedOperatorField *op_fields; 103 104 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 105 if (is_input) { 106 CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); 107 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 108 } else { 109 CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); 110 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 111 } 112 113 // Loop over fields 114 for (CeedInt i = 0; i < num_fields; i++) { 115 bool is_strided = false, skip_restriction = false; 116 CeedSize q_size; 117 CeedInt size; 118 CeedEvalMode eval_mode; 119 CeedBasis basis; 120 121 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 122 if (eval_mode != CEED_EVAL_WEIGHT) { 123 CeedElemRestriction elem_rstr; 124 125 // Check whether this field can skip the element restriction: 126 // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. 127 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); 128 129 // First, check whether the field is input or output: 130 if (is_input) { 131 CeedVector vec; 132 133 // Check for passive input 134 CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); 135 if (vec != CEED_VECTOR_ACTIVE) { 136 // Check eval_mode 137 if (eval_mode == CEED_EVAL_NONE) { 138 // Check for strided restriction 139 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 140 if (is_strided) { 141 // Check if vector is already in preferred backend ordering 142 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction)); 143 } 144 } 145 } 146 } 147 if (skip_restriction) { 148 // We do not need an E-Vector, but will use the input field vector's data directly in the operator application. 149 e_vecs[i + start_e] = NULL; 150 } else { 151 CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e])); 152 } 153 } 154 155 switch (eval_mode) { 156 case CEED_EVAL_NONE: 157 CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); 158 q_size = (CeedSize)num_elem * Q * size; 159 CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); 160 break; 161 case CEED_EVAL_INTERP: 162 case CEED_EVAL_GRAD: 163 case CEED_EVAL_DIV: 164 case CEED_EVAL_CURL: 165 CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); 166 q_size = (CeedSize)num_elem * Q * size; 167 CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); 168 break; 169 case CEED_EVAL_WEIGHT: // Only on input fields 170 CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); 171 q_size = (CeedSize)num_elem * Q; 172 CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); 173 CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); 174 break; 175 } 176 } 177 return CEED_ERROR_SUCCESS; 178 } 179 180 //------------------------------------------------------------------------------ 181 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction. 182 //------------------------------------------------------------------------------ 183 static int CeedOperatorSetup_Cuda(CeedOperator op) { 184 Ceed ceed; 185 bool is_setup_done; 186 CeedInt Q, num_elem, num_input_fields, num_output_fields; 187 CeedQFunctionField *qf_input_fields, *qf_output_fields; 188 CeedQFunction qf; 189 CeedOperatorField *op_input_fields, *op_output_fields; 190 CeedOperator_Cuda *impl; 191 192 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 193 if (is_setup_done) return CEED_ERROR_SUCCESS; 194 195 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 196 CeedCallBackend(CeedOperatorGetData(op, &impl)); 197 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 198 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 199 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 200 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 201 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 202 203 // Allocate 204 CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); 205 CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states)); 206 CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); 207 CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); 208 impl->num_inputs = num_input_fields; 209 impl->num_outputs = num_output_fields; 210 211 // Set up infield and outfield e_vecs and q_vecs 212 // Infields 213 CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); 214 // Outfields 215 CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); 216 217 CeedCallBackend(CeedOperatorSetSetupDone(op)); 218 return CEED_ERROR_SUCCESS; 219 } 220 221 //------------------------------------------------------------------------------ 222 // Setup Operator Inputs 223 //------------------------------------------------------------------------------ 224 static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, 225 CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], 226 CeedOperator_Cuda *impl, CeedRequest *request) { 227 for (CeedInt i = 0; i < num_input_fields; i++) { 228 CeedEvalMode eval_mode; 229 CeedVector vec; 230 CeedElemRestriction elem_rstr; 231 232 // Get input vector 233 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 234 if (vec == CEED_VECTOR_ACTIVE) { 235 if (skip_active) continue; 236 else vec = in_vec; 237 } 238 239 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 240 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 241 } else { 242 // Get input vector 243 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 244 // Get input element restriction 245 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 246 if (vec == CEED_VECTOR_ACTIVE) vec = in_vec; 247 // Restrict, if necessary 248 if (!impl->e_vecs[i]) { 249 // No restriction for this field; read data directly from vec. 250 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); 251 } else { 252 uint64_t state; 253 254 CeedCallBackend(CeedVectorGetState(vec, &state)); 255 if (state != impl->input_states[i]) { 256 CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); 257 impl->input_states[i] = state; 258 } 259 // Get evec 260 CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); 261 } 262 } 263 } 264 return CEED_ERROR_SUCCESS; 265 } 266 267 //------------------------------------------------------------------------------ 268 // Input Basis Action 269 //------------------------------------------------------------------------------ 270 static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, 271 CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], 272 CeedOperator_Cuda *impl) { 273 for (CeedInt i = 0; i < num_input_fields; i++) { 274 CeedInt elem_size, size; 275 CeedEvalMode eval_mode; 276 CeedElemRestriction elem_rstr; 277 CeedBasis basis; 278 279 // Skip active input 280 if (skip_active) { 281 CeedVector vec; 282 283 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 284 if (vec == CEED_VECTOR_ACTIVE) continue; 285 } 286 // Get elem_size, eval_mode, size 287 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 288 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 289 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 290 CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); 291 // Basis action 292 switch (eval_mode) { 293 case CEED_EVAL_NONE: 294 CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); 295 break; 296 case CEED_EVAL_INTERP: 297 case CEED_EVAL_GRAD: 298 case CEED_EVAL_DIV: 299 case CEED_EVAL_CURL: 300 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 301 CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i])); 302 break; 303 case CEED_EVAL_WEIGHT: 304 break; // No action 305 } 306 } 307 return CEED_ERROR_SUCCESS; 308 } 309 310 //------------------------------------------------------------------------------ 311 // Restore Input Vectors 312 //------------------------------------------------------------------------------ 313 static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, 314 const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { 315 for (CeedInt i = 0; i < num_input_fields; i++) { 316 CeedEvalMode eval_mode; 317 CeedVector vec; 318 319 // Skip active input 320 if (skip_active) { 321 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 322 if (vec == CEED_VECTOR_ACTIVE) continue; 323 } 324 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 325 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 326 } else { 327 if (!impl->e_vecs[i]) { // This was a skip_restriction case 328 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 329 CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i])); 330 } else { 331 CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i])); 332 } 333 } 334 } 335 return CEED_ERROR_SUCCESS; 336 } 337 338 //------------------------------------------------------------------------------ 339 // Apply and add to output 340 //------------------------------------------------------------------------------ 341 static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { 342 CeedInt Q, num_elem, elem_size, num_input_fields, num_output_fields, size; 343 CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; 344 CeedQFunctionField *qf_input_fields, *qf_output_fields; 345 CeedQFunction qf; 346 CeedOperatorField *op_input_fields, *op_output_fields; 347 CeedOperator_Cuda *impl; 348 349 CeedCallBackend(CeedOperatorGetData(op, &impl)); 350 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 351 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 352 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 353 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 354 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 355 356 // Setup 357 CeedCallBackend(CeedOperatorSetup_Cuda(op)); 358 359 // Input Evecs and Restriction 360 CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); 361 362 // Input basis apply if needed 363 CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl)); 364 365 // Output pointers, as necessary 366 for (CeedInt i = 0; i < num_output_fields; i++) { 367 CeedEvalMode eval_mode; 368 369 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 370 if (eval_mode == CEED_EVAL_NONE) { 371 // Set the output Q-Vector to use the E-Vector data directly. 372 CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); 373 CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); 374 } 375 } 376 377 // Q function 378 CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); 379 380 // Output basis apply if needed 381 for (CeedInt i = 0; i < num_output_fields; i++) { 382 CeedEvalMode eval_mode; 383 CeedElemRestriction elem_rstr; 384 CeedBasis basis; 385 386 // Get elem_size, eval_mode, size 387 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 388 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 389 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 390 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); 391 // Basis action 392 switch (eval_mode) { 393 case CEED_EVAL_NONE: 394 break; // No action 395 case CEED_EVAL_INTERP: 396 case CEED_EVAL_GRAD: 397 case CEED_EVAL_DIV: 398 case CEED_EVAL_CURL: 399 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 400 CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); 401 break; 402 // LCOV_EXCL_START 403 case CEED_EVAL_WEIGHT: { 404 return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 405 // LCOV_EXCL_STOP 406 } 407 } 408 } 409 410 // Output restriction 411 for (CeedInt i = 0; i < num_output_fields; i++) { 412 CeedEvalMode eval_mode; 413 CeedVector vec; 414 CeedElemRestriction elem_rstr; 415 416 // Restore evec 417 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 418 if (eval_mode == CEED_EVAL_NONE) { 419 CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields])); 420 } 421 // Get output vector 422 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 423 // Restrict 424 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 425 // Active 426 if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; 427 428 CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request)); 429 } 430 431 // Restore input arrays 432 CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); 433 return CEED_ERROR_SUCCESS; 434 } 435 436 //------------------------------------------------------------------------------ 437 // Linear QFunction Assembly Core 438 //------------------------------------------------------------------------------ 439 static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, 440 CeedRequest *request) { 441 Ceed ceed, ceed_parent; 442 CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; 443 CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; 444 CeedVector *active_inputs; 445 CeedQFunctionField *qf_input_fields, *qf_output_fields; 446 CeedQFunction qf; 447 CeedOperatorField *op_input_fields, *op_output_fields; 448 CeedOperator_Cuda *impl; 449 450 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 451 CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent)); 452 CeedCallBackend(CeedOperatorGetData(op, &impl)); 453 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 454 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 455 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 456 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 457 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 458 active_inputs = impl->qf_active_in; 459 num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; 460 461 // Setup 462 CeedCallBackend(CeedOperatorSetup_Cuda(op)); 463 464 // Input Evecs and Restriction 465 CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); 466 467 // Count number of active input fields 468 if (!num_active_in) { 469 for (CeedInt i = 0; i < num_input_fields; i++) { 470 CeedScalar *q_vec_array; 471 CeedVector vec; 472 473 // Get input vector 474 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 475 // Check if active input 476 if (vec == CEED_VECTOR_ACTIVE) { 477 CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); 478 CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); 479 CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); 480 CeedCallBackend(CeedRealloc(num_active_in + size, &active_inputs)); 481 for (CeedInt field = 0; field < size; field++) { 482 CeedSize q_size = (CeedSize)Q * num_elem; 483 484 CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field])); 485 CeedCallBackend( 486 CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); 487 } 488 num_active_in += size; 489 CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); 490 } 491 } 492 impl->num_active_in = num_active_in; 493 impl->qf_active_in = active_inputs; 494 } 495 496 // Count number of active output fields 497 if (!num_active_out) { 498 for (CeedInt i = 0; i < num_output_fields; i++) { 499 CeedVector vec; 500 501 // Get output vector 502 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 503 // Check if active output 504 if (vec == CEED_VECTOR_ACTIVE) { 505 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); 506 num_active_out += size; 507 } 508 } 509 impl->num_active_out = num_active_out; 510 } 511 512 // Check sizes 513 CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); 514 515 // Build objects if needed 516 if (build_objects) { 517 CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; 518 CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ 519 520 // Create output restriction 521 CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, 522 num_active_in * num_active_out * num_elem * Q, strides, rstr)); 523 // Create assembled vector 524 CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); 525 } 526 CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); 527 CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array)); 528 529 // Input basis apply 530 CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl)); 531 532 // Assemble QFunction 533 for (CeedInt in = 0; in < num_active_in; in++) { 534 // Set Inputs 535 CeedCallBackend(CeedVectorSetValue(active_inputs[in], 1.0)); 536 if (num_active_in > 1) { 537 CeedCallBackend(CeedVectorSetValue(active_inputs[(in + num_active_in - 1) % num_active_in], 0.0)); 538 } 539 // Set Outputs 540 for (CeedInt out = 0; out < num_output_fields; out++) { 541 CeedVector vec; 542 543 // Get output vector 544 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); 545 // Check if active output 546 if (vec == CEED_VECTOR_ACTIVE) { 547 CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array)); 548 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); 549 assembled_array += size * Q * num_elem; // Advance the pointer by the size of the output 550 } 551 } 552 // Apply QFunction 553 CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); 554 } 555 556 // Un-set output q_vecs to prevent accidental overwrite of Assembled 557 for (CeedInt out = 0; out < num_output_fields; out++) { 558 CeedVector vec; 559 560 // Get output vector 561 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); 562 // Check if active output 563 if (vec == CEED_VECTOR_ACTIVE) { 564 CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL)); 565 } 566 } 567 568 // Restore input arrays 569 CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl)); 570 571 // Restore output 572 CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); 573 return CEED_ERROR_SUCCESS; 574 } 575 576 //------------------------------------------------------------------------------ 577 // Assemble Linear QFunction 578 //------------------------------------------------------------------------------ 579 static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { 580 return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr, request); 581 } 582 583 //------------------------------------------------------------------------------ 584 // Update Assembled Linear QFunction 585 //------------------------------------------------------------------------------ 586 static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { 587 return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled, &rstr, request); 588 } 589 590 //------------------------------------------------------------------------------ 591 // Assemble Diagonal Setup 592 //------------------------------------------------------------------------------ 593 static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) { 594 Ceed ceed; 595 CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; 596 CeedInt q_comp, num_nodes, num_qpts; 597 CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; 598 CeedBasis basis_in = NULL, basis_out = NULL; 599 CeedQFunctionField *qf_fields; 600 CeedQFunction qf; 601 CeedOperatorField *op_fields; 602 CeedOperator_Cuda *impl; 603 604 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 605 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 606 CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); 607 608 // Determine active input basis 609 CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); 610 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 611 for (CeedInt i = 0; i < num_input_fields; i++) { 612 CeedVector vec; 613 614 CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); 615 if (vec == CEED_VECTOR_ACTIVE) { 616 CeedBasis basis; 617 CeedEvalMode eval_mode; 618 619 CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); 620 CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, 621 "Backend does not implement operator diagonal assembly with multiple active bases"); 622 basis_in = basis; 623 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 624 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); 625 if (eval_mode != CEED_EVAL_WEIGHT) { 626 // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly 627 CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in)); 628 for (CeedInt d = 0; d < q_comp; d++) eval_modes_in[num_eval_modes_in + d] = eval_mode; 629 num_eval_modes_in += q_comp; 630 } 631 } 632 } 633 634 // Determine active output basis 635 CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); 636 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 637 for (CeedInt i = 0; i < num_output_fields; i++) { 638 CeedVector vec; 639 640 CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); 641 if (vec == CEED_VECTOR_ACTIVE) { 642 CeedBasis basis; 643 CeedEvalMode eval_mode; 644 645 CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); 646 CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, 647 "Backend does not implement operator diagonal assembly with multiple active bases"); 648 basis_out = basis; 649 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 650 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); 651 if (eval_mode != CEED_EVAL_WEIGHT) { 652 // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly 653 CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out)); 654 for (CeedInt d = 0; d < q_comp; d++) eval_modes_out[num_eval_modes_out + d] = eval_mode; 655 num_eval_modes_out += q_comp; 656 } 657 } 658 } 659 660 // Operator data struct 661 CeedCallBackend(CeedOperatorGetData(op, &impl)); 662 CeedCallBackend(CeedCalloc(1, &impl->diag)); 663 CeedOperatorDiag_Cuda *diag = impl->diag; 664 665 // Basis matrices 666 CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); 667 if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; 668 else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); 669 const CeedInt interp_bytes = num_nodes * num_qpts * sizeof(CeedScalar); 670 const CeedInt eval_modes_bytes = sizeof(CeedEvalMode); 671 bool has_eval_none = false; 672 673 // CEED_EVAL_NONE 674 for (CeedInt i = 0; i < num_eval_modes_in; i++) has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE); 675 for (CeedInt i = 0; i < num_eval_modes_out; i++) has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE); 676 if (has_eval_none) { 677 CeedScalar *identity = NULL; 678 679 CeedCallBackend(CeedCalloc(num_nodes * num_qpts, &identity)); 680 for (CeedInt i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0; 681 CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_identity, interp_bytes)); 682 CeedCallCuda(ceed, cudaMemcpy(diag->d_identity, identity, interp_bytes, cudaMemcpyHostToDevice)); 683 CeedCallBackend(CeedFree(&identity)); 684 } 685 686 // CEED_EVAL_INTERP, CEED_EVAL_GRAD, CEED_EVAL_DIV, and CEED_EVAL_CURL 687 for (CeedInt in = 0; in < 2; in++) { 688 CeedFESpace fespace; 689 CeedBasis basis = in ? basis_in : basis_out; 690 691 CeedCallBackend(CeedBasisGetFESpace(basis, &fespace)); 692 switch (fespace) { 693 case CEED_FE_SPACE_H1: { 694 CeedInt q_comp_interp, q_comp_grad; 695 const CeedScalar *interp, *grad; 696 CeedScalar *d_interp, *d_grad; 697 698 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 699 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 700 701 CeedCallBackend(CeedBasisGetInterp(basis, &interp)); 702 CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); 703 CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice)); 704 CeedCallBackend(CeedBasisGetGrad(basis, &grad)); 705 CeedCallCuda(ceed, cudaMalloc((void **)&d_grad, interp_bytes * q_comp_grad)); 706 CeedCallCuda(ceed, cudaMemcpy(d_grad, grad, interp_bytes * q_comp_grad, cudaMemcpyHostToDevice)); 707 if (in) { 708 diag->d_interp_in = d_interp; 709 diag->d_grad_in = d_grad; 710 } else { 711 diag->d_interp_out = d_interp; 712 diag->d_grad_out = d_grad; 713 } 714 } break; 715 case CEED_FE_SPACE_HDIV: { 716 CeedInt q_comp_interp, q_comp_div; 717 const CeedScalar *interp, *div; 718 CeedScalar *d_interp, *d_div; 719 720 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 721 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 722 723 CeedCallBackend(CeedBasisGetInterp(basis, &interp)); 724 CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); 725 CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice)); 726 CeedCallBackend(CeedBasisGetDiv(basis, &div)); 727 CeedCallCuda(ceed, cudaMalloc((void **)&d_div, interp_bytes * q_comp_div)); 728 CeedCallCuda(ceed, cudaMemcpy(d_div, div, interp_bytes * q_comp_div, cudaMemcpyHostToDevice)); 729 if (in) { 730 diag->d_interp_in = d_interp; 731 diag->d_div_in = d_div; 732 } else { 733 diag->d_interp_out = d_interp; 734 diag->d_div_out = d_div; 735 } 736 } break; 737 case CEED_FE_SPACE_HCURL: { 738 CeedInt q_comp_interp, q_comp_curl; 739 const CeedScalar *interp, *curl; 740 CeedScalar *d_interp, *d_curl; 741 742 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 743 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 744 745 CeedCallBackend(CeedBasisGetInterp(basis, &interp)); 746 CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); 747 CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice)); 748 CeedCallBackend(CeedBasisGetCurl(basis, &curl)); 749 CeedCallCuda(ceed, cudaMalloc((void **)&d_curl, interp_bytes * q_comp_curl)); 750 CeedCallCuda(ceed, cudaMemcpy(d_curl, curl, interp_bytes * q_comp_curl, cudaMemcpyHostToDevice)); 751 if (in) { 752 diag->d_interp_in = d_interp; 753 diag->d_curl_in = d_curl; 754 } else { 755 diag->d_interp_out = d_interp; 756 diag->d_curl_out = d_curl; 757 } 758 } break; 759 } 760 } 761 762 // Arrays of eval_modes 763 CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_eval_modes_in, num_eval_modes_in * eval_modes_bytes)); 764 CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_in, eval_modes_in, num_eval_modes_in * eval_modes_bytes, cudaMemcpyHostToDevice)); 765 CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_eval_modes_out, num_eval_modes_out * eval_modes_bytes)); 766 CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, cudaMemcpyHostToDevice)); 767 CeedCallBackend(CeedFree(&eval_modes_in)); 768 CeedCallBackend(CeedFree(&eval_modes_out)); 769 return CEED_ERROR_SUCCESS; 770 } 771 772 //------------------------------------------------------------------------------ 773 // Assemble Diagonal Setup (Compilation) 774 //------------------------------------------------------------------------------ 775 static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) { 776 Ceed ceed; 777 char *diagonal_kernel_source; 778 const char *diagonal_kernel_path; 779 CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; 780 CeedInt num_comp, q_comp, num_nodes, num_qpts; 781 CeedBasis basis_in = NULL, basis_out = NULL; 782 CeedQFunctionField *qf_fields; 783 CeedQFunction qf; 784 CeedOperatorField *op_fields; 785 CeedOperator_Cuda *impl; 786 787 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 788 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 789 CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); 790 791 // Determine active input basis 792 CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); 793 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 794 for (CeedInt i = 0; i < num_input_fields; i++) { 795 CeedVector vec; 796 797 CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); 798 if (vec == CEED_VECTOR_ACTIVE) { 799 CeedEvalMode eval_mode; 800 801 CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); 802 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 803 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); 804 if (eval_mode != CEED_EVAL_WEIGHT) { 805 num_eval_modes_in += q_comp; 806 } 807 } 808 } 809 810 // Determine active output basis 811 CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); 812 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 813 for (CeedInt i = 0; i < num_output_fields; i++) { 814 CeedVector vec; 815 816 CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); 817 if (vec == CEED_VECTOR_ACTIVE) { 818 CeedEvalMode eval_mode; 819 820 CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); 821 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 822 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); 823 if (eval_mode != CEED_EVAL_WEIGHT) { 824 num_eval_modes_out += q_comp; 825 } 826 } 827 } 828 829 // Operator data struct 830 CeedCallBackend(CeedOperatorGetData(op, &impl)); 831 CeedOperatorDiag_Cuda *diag = impl->diag; 832 833 // Assemble kernel 834 CUmodule *module = is_point_block ? &diag->module_point_block : &diag->module; 835 CeedInt elems_per_block = 1; 836 CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); 837 CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); 838 if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; 839 else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); 840 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); 841 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); 842 CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); 843 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); 844 CeedCallCuda(ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", 845 num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE", 846 use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block)); 847 CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal)); 848 CeedCallBackend(CeedFree(&diagonal_kernel_path)); 849 CeedCallBackend(CeedFree(&diagonal_kernel_source)); 850 return CEED_ERROR_SUCCESS; 851 } 852 853 //------------------------------------------------------------------------------ 854 // Assemble Diagonal Core 855 //------------------------------------------------------------------------------ 856 static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { 857 Ceed ceed; 858 CeedInt num_elem, num_nodes; 859 CeedScalar *elem_diag_array; 860 const CeedScalar *assembled_qf_array; 861 CeedVector assembled_qf = NULL, elem_diag; 862 CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out, diag_rstr; 863 CeedOperator_Cuda *impl; 864 865 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 866 CeedCallBackend(CeedOperatorGetData(op, &impl)); 867 868 // Assemble QFunction 869 CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, request)); 870 CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); 871 CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); 872 873 // Setup 874 if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op)); 875 CeedOperatorDiag_Cuda *diag = impl->diag; 876 877 assert(diag != NULL); 878 879 // Assemble kernel if needed 880 if ((!is_point_block && !diag->LinearDiagonal) || (is_point_block && !diag->LinearPointBlock)) { 881 CeedSize assembled_length, assembled_qf_length; 882 CeedInt use_ceedsize_idx = 0; 883 CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); 884 CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); 885 if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; 886 887 CeedCallBackend(CeedOperatorAssembleDiagonalSetupCompile_Cuda(op, use_ceedsize_idx, is_point_block)); 888 } 889 890 // Restriction and diagonal vector 891 CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); 892 CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND, 893 "Cannot assemble operator diagonal with different input and output active element restrictions"); 894 if (!is_point_block && !diag->diag_rstr) { 895 CeedCallBackend(CeedElemRestrictionCreateUnsignedCopy(rstr_out, &diag->diag_rstr)); 896 CeedCallBackend(CeedElemRestrictionCreateVector(diag->diag_rstr, NULL, &diag->elem_diag)); 897 } else if (is_point_block && !diag->point_block_diag_rstr) { 898 CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr)); 899 CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag)); 900 } 901 diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; 902 elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; 903 CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); 904 905 // Only assemble diagonal if the basis has nodes, otherwise inputs are null pointers 906 CeedCallBackend(CeedElemRestrictionGetElementSize(diag_rstr, &num_nodes)); 907 if (num_nodes > 0) { 908 // Assemble element operator diagonals 909 CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); 910 CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); 911 912 // Compute the diagonal of B^T D B 913 CeedInt elems_per_block = 1; 914 CeedInt grid = CeedDivUpInt(num_elem, elems_per_block); 915 void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_div_in, 916 &diag->d_curl_in, &diag->d_interp_out, &diag->d_grad_out, &diag->d_div_out, &diag->d_curl_out, 917 &diag->d_eval_modes_in, &diag->d_eval_modes_out, &assembled_qf_array, &elem_diag_array}; 918 919 if (is_point_block) { 920 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args)); 921 } else { 922 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args)); 923 } 924 925 // Restore arrays 926 CeedCallBackend(CeedVectorRestoreArray(elem_diag, &elem_diag_array)); 927 CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); 928 } 929 930 // Assemble local operator diagonal 931 CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request)); 932 933 // Cleanup 934 CeedCallBackend(CeedVectorDestroy(&assembled_qf)); 935 return CEED_ERROR_SUCCESS; 936 } 937 938 //------------------------------------------------------------------------------ 939 // Assemble Linear Diagonal 940 //------------------------------------------------------------------------------ 941 static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) { 942 CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false)); 943 return CEED_ERROR_SUCCESS; 944 } 945 946 //------------------------------------------------------------------------------ 947 // Assemble Linear Point Block Diagonal 948 //------------------------------------------------------------------------------ 949 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) { 950 CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true)); 951 return CEED_ERROR_SUCCESS; 952 } 953 954 //------------------------------------------------------------------------------ 955 // Single Operator Assembly Setup 956 //------------------------------------------------------------------------------ 957 static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { 958 Ceed ceed; 959 Ceed_Cuda *cuda_data; 960 char *assembly_kernel_source; 961 const char *assembly_kernel_path; 962 CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; 963 CeedInt elem_size_in, num_qpts_in = 0, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp; 964 CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; 965 CeedElemRestriction rstr_in = NULL, rstr_out = NULL; 966 CeedBasis basis_in = NULL, basis_out = NULL; 967 CeedQFunctionField *qf_fields; 968 CeedQFunction qf; 969 CeedOperatorField *input_fields, *output_fields; 970 CeedOperator_Cuda *impl; 971 972 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 973 CeedCallBackend(CeedOperatorGetData(op, &impl)); 974 975 // Get intput and output fields 976 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); 977 978 // Determine active input basis eval mode 979 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 980 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); 981 for (CeedInt i = 0; i < num_input_fields; i++) { 982 CeedVector vec; 983 984 CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); 985 if (vec == CEED_VECTOR_ACTIVE) { 986 CeedBasis basis; 987 CeedEvalMode eval_mode; 988 989 CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis)); 990 CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); 991 basis_in = basis; 992 CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); 993 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); 994 if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in; 995 else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in)); 996 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 997 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); 998 if (eval_mode != CEED_EVAL_WEIGHT) { 999 // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly 1000 CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in)); 1001 for (CeedInt d = 0; d < q_comp; d++) { 1002 eval_modes_in[num_eval_modes_in + d] = eval_mode; 1003 } 1004 num_eval_modes_in += q_comp; 1005 } 1006 } 1007 } 1008 1009 // Determine active output basis; basis_out and rstr_out only used if same as input, TODO 1010 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); 1011 for (CeedInt i = 0; i < num_output_fields; i++) { 1012 CeedVector vec; 1013 1014 CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); 1015 if (vec == CEED_VECTOR_ACTIVE) { 1016 CeedBasis basis; 1017 CeedEvalMode eval_mode; 1018 1019 CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis)); 1020 CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, 1021 "Backend does not implement operator assembly with multiple active bases"); 1022 basis_out = basis; 1023 CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); 1024 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); 1025 if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out; 1026 else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out)); 1027 CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED, 1028 "Active input and output bases must have the same number of quadrature points"); 1029 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); 1030 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); 1031 if (eval_mode != CEED_EVAL_WEIGHT) { 1032 // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly 1033 CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out)); 1034 for (CeedInt d = 0; d < q_comp; d++) { 1035 eval_modes_out[num_eval_modes_out + d] = eval_mode; 1036 } 1037 num_eval_modes_out += q_comp; 1038 } 1039 } 1040 } 1041 CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); 1042 1043 CeedCallBackend(CeedCalloc(1, &impl->asmb)); 1044 CeedOperatorAssemble_Cuda *asmb = impl->asmb; 1045 asmb->elems_per_block = 1; 1046 asmb->block_size_x = elem_size_in; 1047 asmb->block_size_y = elem_size_out; 1048 1049 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 1050 bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > cuda_data->device_prop.maxThreadsPerBlock; 1051 1052 if (fallback) { 1053 // Use fallback kernel with 1D threadblock 1054 asmb->block_size_y = 1; 1055 } 1056 1057 // Compile kernels 1058 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in)); 1059 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out)); 1060 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path)); 1061 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); 1062 CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); 1063 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n"); 1064 CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", 1065 num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in, 1066 "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE", 1067 asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, 1068 "USE_CEEDSIZE", use_ceedsize_idx)); 1069 CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble)); 1070 CeedCallBackend(CeedFree(&assembly_kernel_path)); 1071 CeedCallBackend(CeedFree(&assembly_kernel_source)); 1072 1073 // Load into B_in, in order that they will be used in eval_modes_in 1074 { 1075 const CeedInt in_bytes = elem_size_in * num_qpts_in * num_eval_modes_in * sizeof(CeedScalar); 1076 CeedInt d_in = 0; 1077 CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE; 1078 bool has_eval_none = false; 1079 CeedScalar *identity = NULL; 1080 1081 for (CeedInt i = 0; i < num_eval_modes_in; i++) { 1082 has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE); 1083 } 1084 if (has_eval_none) { 1085 CeedCallBackend(CeedCalloc(elem_size_in * num_qpts_in, &identity)); 1086 for (CeedInt i = 0; i < (elem_size_in < num_qpts_in ? elem_size_in : num_qpts_in); i++) identity[i * elem_size_in + i] = 1.0; 1087 } 1088 1089 CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_in, in_bytes)); 1090 for (CeedInt i = 0; i < num_eval_modes_in; i++) { 1091 const CeedScalar *h_B_in; 1092 1093 CeedCallBackend(CeedOperatorGetBasisPointer(basis_in, eval_modes_in[i], identity, &h_B_in)); 1094 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_modes_in[i], &q_comp)); 1095 if (q_comp > 1) { 1096 if (i == 0 || eval_modes_in[i] != eval_modes_in_prev) d_in = 0; 1097 else h_B_in = &h_B_in[(++d_in) * elem_size_in * num_qpts_in]; 1098 } 1099 eval_modes_in_prev = eval_modes_in[i]; 1100 1101 CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar), 1102 cudaMemcpyHostToDevice)); 1103 } 1104 1105 if (identity) { 1106 CeedCallBackend(CeedFree(&identity)); 1107 } 1108 } 1109 1110 // Load into B_out, in order that they will be used in eval_modes_out 1111 { 1112 const CeedInt out_bytes = elem_size_out * num_qpts_out * num_eval_modes_out * sizeof(CeedScalar); 1113 CeedInt d_out = 0; 1114 CeedEvalMode eval_modes_out_prev = CEED_EVAL_NONE; 1115 bool has_eval_none = false; 1116 CeedScalar *identity = NULL; 1117 1118 for (CeedInt i = 0; i < num_eval_modes_out; i++) { 1119 has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE); 1120 } 1121 if (has_eval_none) { 1122 CeedCallBackend(CeedCalloc(elem_size_out * num_qpts_out, &identity)); 1123 for (CeedInt i = 0; i < (elem_size_out < num_qpts_out ? elem_size_out : num_qpts_out); i++) identity[i * elem_size_out + i] = 1.0; 1124 } 1125 1126 CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, out_bytes)); 1127 for (CeedInt i = 0; i < num_eval_modes_out; i++) { 1128 const CeedScalar *h_B_out; 1129 1130 CeedCallBackend(CeedOperatorGetBasisPointer(basis_out, eval_modes_out[i], identity, &h_B_out)); 1131 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_modes_out[i], &q_comp)); 1132 if (q_comp > 1) { 1133 if (i == 0 || eval_modes_out[i] != eval_modes_out_prev) d_out = 0; 1134 else h_B_out = &h_B_out[(++d_out) * elem_size_out * num_qpts_out]; 1135 } 1136 eval_modes_out_prev = eval_modes_out[i]; 1137 1138 CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar), 1139 cudaMemcpyHostToDevice)); 1140 } 1141 1142 if (identity) { 1143 CeedCallBackend(CeedFree(&identity)); 1144 } 1145 } 1146 return CEED_ERROR_SUCCESS; 1147 } 1148 1149 //------------------------------------------------------------------------------ 1150 // Assemble matrix data for COO matrix of assembled operator. 1151 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic. 1152 // 1153 // Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval 1154 // modes). 1155 // TODO: allow multiple active input restrictions/basis objects 1156 //------------------------------------------------------------------------------ 1157 static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) { 1158 Ceed ceed; 1159 CeedSize values_length = 0, assembled_qf_length = 0; 1160 CeedInt use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out; 1161 CeedScalar *values_array; 1162 const CeedScalar *assembled_qf_array; 1163 CeedVector assembled_qf = NULL; 1164 CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out; 1165 CeedRestrictionType rstr_type_in, rstr_type_out; 1166 const bool *orients_in = NULL, *orients_out = NULL; 1167 const CeedInt8 *curl_orients_in = NULL, *curl_orients_out = NULL; 1168 CeedOperator_Cuda *impl; 1169 1170 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1171 CeedCallBackend(CeedOperatorGetData(op, &impl)); 1172 1173 // Assemble QFunction 1174 CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, CEED_REQUEST_IMMEDIATE)); 1175 CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); 1176 CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); 1177 1178 CeedCallBackend(CeedVectorGetLength(values, &values_length)); 1179 CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); 1180 if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; 1181 1182 // Setup 1183 if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx)); 1184 CeedOperatorAssemble_Cuda *asmb = impl->asmb; 1185 1186 assert(asmb != NULL); 1187 1188 // Assemble element operator 1189 CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); 1190 values_array += offset; 1191 1192 CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); 1193 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem_in)); 1194 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); 1195 1196 CeedCallBackend(CeedElemRestrictionGetType(rstr_in, &rstr_type_in)); 1197 if (rstr_type_in == CEED_RESTRICTION_ORIENTED) { 1198 CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_in, CEED_MEM_DEVICE, &orients_in)); 1199 } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) { 1200 CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_in, CEED_MEM_DEVICE, &curl_orients_in)); 1201 } 1202 1203 if (rstr_in != rstr_out) { 1204 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out)); 1205 CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED, 1206 "Active input and output operator restrictions must have the same number of elements"); 1207 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); 1208 1209 CeedCallBackend(CeedElemRestrictionGetType(rstr_out, &rstr_type_out)); 1210 if (rstr_type_out == CEED_RESTRICTION_ORIENTED) { 1211 CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_out, CEED_MEM_DEVICE, &orients_out)); 1212 } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) { 1213 CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_out, CEED_MEM_DEVICE, &curl_orients_out)); 1214 } 1215 } else { 1216 elem_size_out = elem_size_in; 1217 orients_out = orients_in; 1218 curl_orients_out = curl_orients_in; 1219 } 1220 1221 // Compute B^T D B 1222 CeedInt shared_mem = 1223 ((curl_orients_in || curl_orients_out ? elem_size_in * elem_size_out : 0) + (curl_orients_in ? elem_size_in * asmb->block_size_y : 0)) * 1224 sizeof(CeedScalar); 1225 CeedInt grid = CeedDivUpInt(num_elem_in, asmb->elems_per_block); 1226 void *args[] = {(void *)&num_elem_in, &asmb->d_B_in, &asmb->d_B_out, &orients_in, &curl_orients_in, 1227 &orients_out, &curl_orients_out, &assembled_qf_array, &values_array}; 1228 1229 CeedCallBackend( 1230 CeedRunKernelDimShared_Cuda(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args)); 1231 1232 // Restore arrays 1233 CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); 1234 CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); 1235 1236 // Cleanup 1237 CeedCallBackend(CeedVectorDestroy(&assembled_qf)); 1238 if (rstr_type_in == CEED_RESTRICTION_ORIENTED) { 1239 CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_in, &orients_in)); 1240 } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) { 1241 CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_in, &curl_orients_in)); 1242 } 1243 if (rstr_in != rstr_out) { 1244 if (rstr_type_out == CEED_RESTRICTION_ORIENTED) { 1245 CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_out, &orients_out)); 1246 } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) { 1247 CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out)); 1248 } 1249 } 1250 return CEED_ERROR_SUCCESS; 1251 } 1252 1253 //------------------------------------------------------------------------------ 1254 // Create operator 1255 //------------------------------------------------------------------------------ 1256 int CeedOperatorCreate_Cuda(CeedOperator op) { 1257 Ceed ceed; 1258 CeedOperator_Cuda *impl; 1259 1260 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1261 CeedCallBackend(CeedCalloc(1, &impl)); 1262 CeedCallBackend(CeedOperatorSetData(op, impl)); 1263 1264 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda)); 1265 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda)); 1266 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Cuda)); 1267 CeedCallBackend( 1268 CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda)); 1269 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda)); 1270 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda)); 1271 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda)); 1272 return CEED_ERROR_SUCCESS; 1273 } 1274 1275 //------------------------------------------------------------------------------ 1276