xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-basis.c (revision 49aac155e7a09736f56fb3abac0f57dab29f7cbf)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 
14 #include "../cuda/ceed-cuda-common.h"
15 #include "../cuda/ceed-cuda-compile.h"
16 #include "ceed-cuda-ref.h"
17 
18 //------------------------------------------------------------------------------
19 // Basis apply - tensor
20 //------------------------------------------------------------------------------
21 int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
22   Ceed ceed;
23   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
24   Ceed_Cuda *ceed_Cuda;
25   CeedCallBackend(CeedGetData(ceed, &ceed_Cuda));
26   CeedBasis_Cuda *data;
27   CeedCallBackend(CeedBasisGetData(basis, &data));
28   const CeedInt transpose      = t_mode == CEED_TRANSPOSE;
29   const int     max_block_size = 32;
30 
31   // Read vectors
32   const CeedScalar *d_u;
33   CeedScalar       *d_v;
34   if (eval_mode != CEED_EVAL_WEIGHT) {
35     CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
36   }
37   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
38 
39   // Clear v for transpose operation
40   if (t_mode == CEED_TRANSPOSE) {
41     CeedSize length;
42     CeedCallBackend(CeedVectorGetLength(v, &length));
43     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
44   }
45   CeedInt Q_1d, dim;
46   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
47   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
48 
49   // Basis action
50   switch (eval_mode) {
51     case CEED_EVAL_INTERP: {
52       void   *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &d_u, &d_v};
53       CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
54 
55       CeedCallBackend(CeedRunKernelCuda(ceed, data->Interp, num_elem, block_size, interp_args));
56     } break;
57     case CEED_EVAL_GRAD: {
58       void   *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
59       CeedInt block_size  = max_block_size;
60 
61       CeedCallBackend(CeedRunKernelCuda(ceed, data->Grad, num_elem, block_size, grad_args));
62     } break;
63     case CEED_EVAL_WEIGHT: {
64       void     *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
65       const int grid_size     = num_elem;
66       CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Weight, grid_size, Q_1d, dim >= 2 ? Q_1d : 1, 1, weight_args));
67     } break;
68     // LCOV_EXCL_START
69     // Evaluate the divergence to/from the quadrature points
70     case CEED_EVAL_DIV:
71       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
72     // Evaluate the curl to/from the quadrature points
73     case CEED_EVAL_CURL:
74       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
75     // Take no action, BasisApply should not have been called
76     case CEED_EVAL_NONE:
77       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
78       // LCOV_EXCL_STOP
79   }
80 
81   // Restore vectors
82   if (eval_mode != CEED_EVAL_WEIGHT) {
83     CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
84   }
85   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
86   return CEED_ERROR_SUCCESS;
87 }
88 
89 //------------------------------------------------------------------------------
90 // Basis apply - non-tensor
91 //------------------------------------------------------------------------------
92 int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
93                                  CeedVector v) {
94   Ceed ceed;
95   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
96   Ceed_Cuda *ceed_Cuda;
97   CeedCallBackend(CeedGetData(ceed, &ceed_Cuda));
98   CeedBasisNonTensor_Cuda *data;
99   CeedCallBackend(CeedBasisGetData(basis, &data));
100   CeedInt num_nodes, num_qpts;
101   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
102   CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
103   const CeedInt transpose       = t_mode == CEED_TRANSPOSE;
104   int           elems_per_block = 1;
105   int           grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
106 
107   // Read vectors
108   const CeedScalar *d_u;
109   CeedScalar       *d_v;
110   if (eval_mode != CEED_EVAL_WEIGHT) {
111     CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
112   }
113   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
114 
115   // Clear v for transpose operation
116   if (t_mode == CEED_TRANSPOSE) {
117     CeedSize length;
118     CeedCallBackend(CeedVectorGetLength(v, &length));
119     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
120   }
121 
122   // Apply basis operation
123   switch (eval_mode) {
124     case CEED_EVAL_INTERP: {
125       void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp, &d_u, &d_v};
126       if (transpose) {
127         CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Interp, grid, num_nodes, 1, elems_per_block, interp_args));
128       } else {
129         CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Interp, grid, num_qpts, 1, elems_per_block, interp_args));
130       }
131     } break;
132     case CEED_EVAL_GRAD: {
133       void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_grad, &d_u, &d_v};
134       if (transpose) {
135         CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Grad, grid, num_nodes, 1, elems_per_block, grad_args));
136       } else {
137         CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Grad, grid, num_qpts, 1, elems_per_block, grad_args));
138       }
139     } break;
140     case CEED_EVAL_WEIGHT: {
141       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v};
142       CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args));
143     } break;
144     // LCOV_EXCL_START
145     // Evaluate the divergence to/from the quadrature points
146     case CEED_EVAL_DIV:
147       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
148     // Evaluate the curl to/from the quadrature points
149     case CEED_EVAL_CURL:
150       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
151     // Take no action, BasisApply should not have been called
152     case CEED_EVAL_NONE:
153       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
154       // LCOV_EXCL_STOP
155   }
156 
157   // Restore vectors
158   if (eval_mode != CEED_EVAL_WEIGHT) {
159     CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
160   }
161   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
162   return CEED_ERROR_SUCCESS;
163 }
164 
165 //------------------------------------------------------------------------------
166 // Destroy tensor basis
167 //------------------------------------------------------------------------------
168 static int CeedBasisDestroy_Cuda(CeedBasis basis) {
169   Ceed ceed;
170   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
171 
172   CeedBasis_Cuda *data;
173   CeedCallBackend(CeedBasisGetData(basis, &data));
174 
175   CeedCallCuda(ceed, cuModuleUnload(data->module));
176 
177   CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
178   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
179   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
180   CeedCallBackend(CeedFree(&data));
181 
182   return CEED_ERROR_SUCCESS;
183 }
184 
185 //------------------------------------------------------------------------------
186 // Destroy non-tensor basis
187 //------------------------------------------------------------------------------
188 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
189   Ceed ceed;
190   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
191 
192   CeedBasisNonTensor_Cuda *data;
193   CeedCallBackend(CeedBasisGetData(basis, &data));
194 
195   CeedCallCuda(ceed, cuModuleUnload(data->module));
196 
197   CeedCallCuda(ceed, cudaFree(data->d_q_weight));
198   CeedCallCuda(ceed, cudaFree(data->d_interp));
199   CeedCallCuda(ceed, cudaFree(data->d_grad));
200   CeedCallBackend(CeedFree(&data));
201 
202   return CEED_ERROR_SUCCESS;
203 }
204 
205 //------------------------------------------------------------------------------
206 // Create tensor
207 //------------------------------------------------------------------------------
208 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
209                                  const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
210   Ceed ceed;
211   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
212   CeedBasis_Cuda *data;
213   CeedCallBackend(CeedCalloc(1, &data));
214 
215   // Copy data to GPU
216   const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
217   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
218   CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
219 
220   const CeedInt interp_bytes = q_bytes * P_1d;
221   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes));
222   CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice));
223 
224   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes));
225   CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice));
226 
227   // Compile basis kernels
228   CeedInt num_comp;
229   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
230   char *basis_kernel_path, *basis_kernel_source;
231   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path));
232   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
233   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
234   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n");
235   CeedCallBackend(CeedCompileCuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
236                                   num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
237                                   "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
238   CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp));
239   CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad));
240   CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight));
241   CeedCallBackend(CeedFree(&basis_kernel_path));
242   CeedCallBackend(CeedFree(&basis_kernel_source));
243 
244   CeedCallBackend(CeedBasisSetData(basis, data));
245 
246   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda));
247   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda));
248   return CEED_ERROR_SUCCESS;
249 }
250 
251 //------------------------------------------------------------------------------
252 // Create non-tensor
253 //------------------------------------------------------------------------------
254 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
255                            const CeedScalar *qref, const CeedScalar *q_weight, CeedBasis basis) {
256   Ceed ceed;
257   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
258   CeedBasisNonTensor_Cuda *data;
259   CeedCallBackend(CeedCalloc(1, &data));
260 
261   // Copy basis data to GPU
262   const CeedInt q_bytes = num_qpts * sizeof(CeedScalar);
263   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
264   CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
265 
266   const CeedInt interp_bytes = q_bytes * num_nodes;
267   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
268   CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
269 
270   const CeedInt grad_bytes = q_bytes * num_nodes * dim;
271   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes));
272   CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice));
273 
274   // Compile basis kernels
275   CeedInt num_comp;
276   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
277   char *basis_kernel_path, *basis_kernel_source;
278   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
279   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
280   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
281   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n");
282   CeedCallCuda(ceed, CeedCompileCuda(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim,
283                                      "BASIS_NUM_COMP", num_comp));
284   CeedCallCuda(ceed, CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp));
285   CeedCallCuda(ceed, CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad));
286   CeedCallCuda(ceed, CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight));
287   CeedCallBackend(CeedFree(&basis_kernel_path));
288   CeedCallBackend(CeedFree(&basis_kernel_source));
289 
290   CeedCallBackend(CeedBasisSetData(basis, data));
291 
292   // Register backend functions
293   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
294   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
295   return CEED_ERROR_SUCCESS;
296 }
297 //------------------------------------------------------------------------------
298