Lines Matching +full:- +full:- +full:ceed

1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
4 // SPDX-License-Identifier: BSD-2-Clause
6 // This file is part of CEED: http://github.com/ceed
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
17 #include "../cuda/ceed-cuda-common.h"
18 #include "../cuda/ceed-cuda-compile.h"
19 #include "ceed-cuda-shared.h"
21 //------------------------------------------------------------------------------
23 //------------------------------------------------------------------------------
26 Ceed ceed; in CeedBasisApplyTensorCore_Cuda_shared() local
33 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); in CeedBasisApplyTensorCore_Cuda_shared()
34 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); in CeedBasisApplyTensorCore_Cuda_shared()
41 …else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is requir… in CeedBasisApplyTensorCore_Cuda_shared()
53 …CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", Ceed… in CeedBasisApplyTensorCore_Cuda_shared()
58 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; in CeedBasisApplyTensorCore_Cuda_shared()
62 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyTensorCore_Cuda_shared()
67 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->Int… in CeedBasisApplyTensorCore_Cuda_shared()
70 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_pe… in CeedBasisApplyTensorCore_Cuda_shared()
80 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->Int… in CeedBasisApplyTensorCore_Cuda_shared()
83 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, … in CeedBasisApplyTensorCore_Cuda_shared()
92 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->Int… in CeedBasisApplyTensorCore_Cuda_shared()
95 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, … in CeedBasisApplyTensorCore_Cuda_shared()
103 …CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEval… in CeedBasisApplyTensorCore_Cuda_shared()
107 CeedScalar *d_grad_1d = data->d_grad_1d; in CeedBasisApplyTensorCore_Cuda_shared()
109 if (data->d_collo_grad_1d) { in CeedBasisApplyTensorCore_Cuda_shared()
110 d_grad_1d = data->d_collo_grad_1d; in CeedBasisApplyTensorCore_Cuda_shared()
112 void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v}; in CeedBasisApplyTensorCore_Cuda_shared()
116 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyTensorCore_Cuda_shared()
121 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradT… in CeedBasisApplyTensorCore_Cuda_shared()
124 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_… in CeedBasisApplyTensorCore_Cuda_shared()
134 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradT… in CeedBasisApplyTensorCore_Cuda_shared()
137 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, el… in CeedBasisApplyTensorCore_Cuda_shared()
145 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradT… in CeedBasisApplyTensorCore_Cuda_shared()
148 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, el… in CeedBasisApplyTensorCore_Cuda_shared()
156 …CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set",… in CeedBasisApplyTensorCore_Cuda_shared()
158 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; in CeedBasisApplyTensorCore_Cuda_shared()
163 …CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, wei… in CeedBasisApplyTensorCore_Cuda_shared()
169 …CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, … in CeedBasisApplyTensorCore_Cuda_shared()
175 …CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, … in CeedBasisApplyTensorCore_Cuda_shared()
183 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); in CeedBasisApplyTensorCore_Cuda_shared()
191 CeedCallBackend(CeedDestroy(&ceed)); in CeedBasisApplyTensorCore_Cuda_shared()
207 //------------------------------------------------------------------------------
208 // Basis apply - tensor AtPoints
209 //------------------------------------------------------------------------------
212 Ceed ceed; in CeedBasisApplyAtPointsCore_Cuda_shared() local
231 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); in CeedBasisApplyAtPointsCore_Cuda_shared()
232 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); in CeedBasisApplyAtPointsCore_Cuda_shared()
242 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, in CeedBasisApplyAtPointsCore_Cuda_shared()
252 if (num_elem != data->num_elem_at_points) { in CeedBasisApplyAtPointsCore_Cuda_shared()
253 data->num_elem_at_points = num_elem; in CeedBasisApplyAtPointsCore_Cuda_shared()
255 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); in CeedBasisApplyAtPointsCore_Cuda_shared()
256 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); in CeedBasisApplyAtPointsCore_Cuda_shared()
257 CeedCallBackend(CeedFree(&data->h_points_per_elem)); in CeedBasisApplyAtPointsCore_Cuda_shared()
258 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); in CeedBasisApplyAtPointsCore_Cuda_shared()
260 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { in CeedBasisApplyAtPointsCore_Cuda_shared()
261 memcpy(data->h_points_per_elem, num_points, num_bytes); in CeedBasisApplyAtPointsCore_Cuda_shared()
262 …CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevi… in CeedBasisApplyAtPointsCore_Cuda_shared()
267 if (data->num_points != max_num_points) { in CeedBasisApplyAtPointsCore_Cuda_shared()
271 data->num_points = max_num_points; in CeedBasisApplyAtPointsCore_Cuda_shared()
273 // -- Create interp matrix to Chebyshev coefficients in CeedBasisApplyAtPointsCore_Cuda_shared()
274 if (!data->d_chebyshev_interp_1d) { in CeedBasisApplyAtPointsCore_Cuda_shared()
281 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); in CeedBasisApplyAtPointsCore_Cuda_shared()
282 …CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cuda… in CeedBasisApplyAtPointsCore_Cuda_shared()
286 // -- Compile kernels in CeedBasisApplyAtPointsCore_Cuda_shared()
287 …nel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor- in CeedBasisApplyAtPointsCore_Cuda_shared()
290 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); in CeedBasisApplyAtPointsCore_Cuda_shared()
292 …CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D"… in CeedBasisApplyAtPointsCore_Cuda_shared()
295 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPo… in CeedBasisApplyAtPointsCore_Cuda_shared()
296 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->I… in CeedBasisApplyAtPointsCore_Cuda_shared()
297 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAddAtPoints", &data in CeedBasisApplyAtPointsCore_Cuda_shared()
298 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints… in CeedBasisApplyAtPointsCore_Cuda_shared()
299 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->Gra… in CeedBasisApplyAtPointsCore_Cuda_shared()
300 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAddAtPoints", &data->… in CeedBasisApplyAtPointsCore_Cuda_shared()
306 …else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is requir… in CeedBasisApplyAtPointsCore_Cuda_shared()
322 …void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, … in CeedBasisApplyAtPointsCore_Cuda_shared()
326 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyAtPointsCore_Cuda_shared()
331 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : d… in CeedBasisApplyAtPointsCore_Cuda_shared()
334 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, … in CeedBasisApplyAtPointsCore_Cuda_shared()
345 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : d… in CeedBasisApplyAtPointsCore_Cuda_shared()
348 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thr… in CeedBasisApplyAtPointsCore_Cuda_shared()
357 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : d… in CeedBasisApplyAtPointsCore_Cuda_shared()
360 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thr… in CeedBasisApplyAtPointsCore_Cuda_shared()
372 …void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d… in CeedBasisApplyAtPointsCore_Cuda_shared()
376 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyAtPointsCore_Cuda_shared()
381 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : dat… in CeedBasisApplyAtPointsCore_Cuda_shared()
384 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, el… in CeedBasisApplyAtPointsCore_Cuda_shared()
394 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : dat… in CeedBasisApplyAtPointsCore_Cuda_shared()
397 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, threa… in CeedBasisApplyAtPointsCore_Cuda_shared()
406 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : dat… in CeedBasisApplyAtPointsCore_Cuda_shared()
409 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, threa… in CeedBasisApplyAtPointsCore_Cuda_shared()
420 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); in CeedBasisApplyAtPointsCore_Cuda_shared()
429 CeedCallBackend(CeedDestroy(&ceed)); in CeedBasisApplyAtPointsCore_Cuda_shared()
445 //------------------------------------------------------------------------------
446 // Apply non-tensor basis
447 //------------------------------------------------------------------------------
450 Ceed ceed; in CeedBasisApplyNonTensorCore_Cuda_shared() local
457 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); in CeedBasisApplyNonTensorCore_Cuda_shared()
458 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); in CeedBasisApplyNonTensorCore_Cuda_shared()
464 …else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is requir… in CeedBasisApplyNonTensorCore_Cuda_shared()
476 …CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEva… in CeedBasisApplyNonTensorCore_Cuda_shared()
481 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; in CeedBasisApplyNonTensorCore_Cuda_shared()
485 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyNonTensorCore_Cuda_shared()
490 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->Int… in CeedBasisApplyNonTensorCore_Cuda_shared()
493 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread, 1, elems_per_b… in CeedBasisApplyNonTensorCore_Cuda_shared()
500 …CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalMod… in CeedBasisApplyNonTensorCore_Cuda_shared()
505 void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v}; in CeedBasisApplyNonTensorCore_Cuda_shared()
509 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyNonTensorCore_Cuda_shared()
514 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradT… in CeedBasisApplyNonTensorCore_Cuda_shared()
517 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread, 1, elems_per_blo… in CeedBasisApplyNonTensorCore_Cuda_shared()
524 …CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", Ce… in CeedBasisApplyNonTensorCore_Cuda_shared()
529 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; in CeedBasisApplyNonTensorCore_Cuda_shared()
533 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyNonTensorCore_Cuda_shared()
536 …CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight… in CeedBasisApplyNonTensorCore_Cuda_shared()
544 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); in CeedBasisApplyNonTensorCore_Cuda_shared()
552 CeedCallBackend(CeedDestroy(&ceed)); in CeedBasisApplyNonTensorCore_Cuda_shared()
568 //------------------------------------------------------------------------------
570 //------------------------------------------------------------------------------
572 Ceed ceed; in CeedBasisDestroy_Cuda_shared() local
575 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); in CeedBasisDestroy_Cuda_shared()
577 CeedCallCuda(ceed, cuModuleUnload(data->module)); in CeedBasisDestroy_Cuda_shared()
578 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); in CeedBasisDestroy_Cuda_shared()
579 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); in CeedBasisDestroy_Cuda_shared()
580 CeedCallBackend(CeedFree(&data->h_points_per_elem)); in CeedBasisDestroy_Cuda_shared()
581 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); in CeedBasisDestroy_Cuda_shared()
582 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); in CeedBasisDestroy_Cuda_shared()
583 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); in CeedBasisDestroy_Cuda_shared()
584 CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); in CeedBasisDestroy_Cuda_shared()
585 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); in CeedBasisDestroy_Cuda_shared()
587 CeedCallBackend(CeedDestroy(&ceed)); in CeedBasisDestroy_Cuda_shared()
591 //------------------------------------------------------------------------------
593 //------------------------------------------------------------------------------
596 Ceed ceed; in CeedBasisCreateTensorH1_Cuda_shared() local
602 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); in CeedBasisCreateTensorH1_Cuda_shared()
607 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); in CeedBasisCreateTensorH1_Cuda_shared()
608 … CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); in CeedBasisCreateTensorH1_Cuda_shared()
610 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); in CeedBasisCreateTensorH1_Cuda_shared()
611 …CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); in CeedBasisCreateTensorH1_Cuda_shared()
612 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); in CeedBasisCreateTensorH1_Cuda_shared()
613 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); in CeedBasisCreateTensorH1_Cuda_shared()
616 data->d_collo_grad_1d = NULL; in CeedBasisCreateTensorH1_Cuda_shared()
624 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); in CeedBasisCreateTensorH1_Cuda_shared()
625 …CeedCallCuda(ceed, cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, cudaMemcpyHost… in CeedBasisCreateTensorH1_Cuda_shared()
631 …sis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-te… in CeedBasisCreateTensorH1_Cuda_shared()
634 …CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, … in CeedBasisCreateTensorH1_Cuda_shared()
638 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocated" : "Inter… in CeedBasisCreateTensorH1_Cuda_shared()
639 …CallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTranspose" : "… in CeedBasisCreateTensorH1_Cuda_shared()
640 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTransposeA… in CeedBasisCreateTensorH1_Cuda_shared()
641 &data->InterpTransposeAdd)); in CeedBasisCreateTensorH1_Cuda_shared()
642 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocated" : "Grad", … in CeedBasisCreateTensorH1_Cuda_shared()
643 …edCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTranspose" : "… in CeedBasisCreateTensorH1_Cuda_shared()
644 …allBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTransposeAdd" : "… in CeedBasisCreateTensorH1_Cuda_shared()
645 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); in CeedBasisCreateTensorH1_Cuda_shared()
650 …CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_sh… in CeedBasisCreateTensorH1_Cuda_shared()
651 …CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_C… in CeedBasisCreateTensorH1_Cuda_shared()
652 …CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoin… in CeedBasisCreateTensorH1_Cuda_shared()
653 …CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAdd… in CeedBasisCreateTensorH1_Cuda_shared()
654 …CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shar… in CeedBasisCreateTensorH1_Cuda_shared()
655 CeedCallBackend(CeedDestroy(&ceed)); in CeedBasisCreateTensorH1_Cuda_shared()
659 //------------------------------------------------------------------------------
660 // Create non-tensor basis
661 //------------------------------------------------------------------------------
664 Ceed ceed; in CeedBasisCreateH1_Cuda_shared() local
669 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); in CeedBasisCreateH1_Cuda_shared()
675 CeedCallBackend(CeedGetData(ceed, &cuda_data)); in CeedBasisCreateH1_Cuda_shared()
677 cuda_data->device_prop.sharedMemPerBlock) { in CeedBasisCreateH1_Cuda_shared()
678 …CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_re… in CeedBasisCreateH1_Cuda_shared()
679 CeedCallBackend(CeedDestroy(&ceed)); in CeedBasisCreateH1_Cuda_shared()
690 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); in CeedBasisCreateH1_Cuda_shared()
691 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight, q_bytes, cudaMemcpyHostToDevice)); in CeedBasisCreateH1_Cuda_shared()
696 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); in CeedBasisCreateH1_Cuda_shared()
697 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp, interp_bytes, cudaMemcpyHostToDevice)); in CeedBasisCreateH1_Cuda_shared()
702 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, grad_bytes)); in CeedBasisCreateH1_Cuda_shared()
703 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad, grad_bytes, cudaMemcpyHostToDevice)); in CeedBasisCreateH1_Cuda_shared()
707 …onst char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/cuda/cuda in CeedBasisCreateH1_Cuda_shared()
710 …CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts,… in CeedBasisCreateH1_Cuda_shared()
712 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); in CeedBasisCreateH1_Cuda_shared()
713 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); in CeedBasisCreateH1_Cuda_shared()
714 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTranspos… in CeedBasisCreateH1_Cuda_shared()
715 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); in CeedBasisCreateH1_Cuda_shared()
716 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); in CeedBasisCreateH1_Cuda_shared()
717 …CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd… in CeedBasisCreateH1_Cuda_shared()
718 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); in CeedBasisCreateH1_Cuda_shared()
723 …CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda… in CeedBasisCreateH1_Cuda_shared()
724 …CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTenso… in CeedBasisCreateH1_Cuda_shared()
725 …CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shar… in CeedBasisCreateH1_Cuda_shared()
726 CeedCallBackend(CeedDestroy(&ceed)); in CeedBasisCreateH1_Cuda_shared()
730 //------------------------------------------------------------------------------