xref: /libCEED/backends/cuda-shared/ceed-cuda-shared-basis.c (revision f374d6a3d4bb58585c9e7362022a5daaf33a852a)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 #include <stddef.h>
14 #include "ceed-cuda-shared.h"
15 #include "../cuda/ceed-cuda-compile.h"
16 
17 
18 //------------------------------------------------------------------------------
// Device initialization
20 //------------------------------------------------------------------------------
// Cache the 1D interpolation matrix d_B (P_1d x Q_1d entries) for kernel use;
// the resulting device pointer is returned through c_B.
// Implemented elsewhere in the backend — presumably copies to constant/cached
// memory; confirm against the .cu implementation.
int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d,
                       CeedScalar **c_B);
// Same as above, but caches both the interpolation matrix d_B and the 1D
// gradient matrix d_G, returning pointers through c_B_ptr and c_G_ptr.
int CeedCudaInitInterpGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d,
                           CeedInt Q_1d, CeedScalar **c_B_ptr,
                           CeedScalar **c_G_ptr);
26 
27 //------------------------------------------------------------------------------
28 // Apply basis
29 //------------------------------------------------------------------------------
// Apply a tensor-product basis operation (interp/grad/weight) on the device.
//
// basis     - basis to apply
// num_elem  - number of elements to process
// t_mode    - CEED_NOTRANSPOSE (nodes -> quadrature points) or CEED_TRANSPOSE
// eval_mode - CEED_EVAL_INTERP, CEED_EVAL_GRAD, or CEED_EVAL_WEIGHT
// u         - input vector (unused for CEED_EVAL_WEIGHT)
// v         - output vector
//
// Returns CEED_ERROR_SUCCESS on success, a CEED error code otherwise.
int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem,
                                     CeedTransposeMode t_mode,
                                     CeedEvalMode eval_mode, CeedVector u,
                                     CeedVector v) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  Ceed_Cuda *ceed_Cuda;
  // Fix: capture the return code; previously `ierr` still held the status of
  // the prior call when CeedChkBackend(ierr) ran.
  ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
  CeedBasis_Cuda_shared *data;
  // Fix: capture the return code here as well (same dropped-status bug).
  ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
  const CeedInt transpose = t_mode == CEED_TRANSPOSE;
  CeedInt dim, num_comp;
  ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
  ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);

  // Read vectors
  const CeedScalar *d_u;
  CeedScalar *d_v;
  if (eval_mode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
  }
  ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);

  // Clear v for transpose mode; transpose kernels accumulate into v
  if (t_mode == CEED_TRANSPOSE) {
    CeedSize length;
    ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
    // Fix: cudaMemset returns cudaError_t, so check it with CeedChk_Cu (as the
    // cudaFree calls in this file do), not CeedChkBackend.
    ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar));
    CeedChk_Cu(ceed, ierr);
  }

  // Apply basis operation
  switch (eval_mode) {
  case CEED_EVAL_INTERP: {
    CeedInt P_1d, Q_1d;
    ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
    // Thread count per 1D direction must cover both node and quadrature grids
    CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
    ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B);
    CeedChkBackend(ierr);
    void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B,
                           &d_u, &d_v
                          };
    if (dim == 1) {
      CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
                                           CeedIntMax(512 / thread_1d,
                                               1)); // avoid >512 total threads
      // Ceiling division: one extra block for the remainder elements
      CeedInt grid = num_elem/elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1,
                                        elems_per_block, shared_mem,
                                        interp_args); CeedChkBackend(ierr);
    } else if (dim == 2) {
      // Check if the grid can be reduced
      const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
      // elems_per_block must be at least 1
      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] /
                                           num_comp : 1, 1);
      CeedInt grid = num_elem / elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
                             CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
                                        thread_1d,
                                        num_comp*elems_per_block, shared_mem,
                                        interp_args); CeedChkBackend(ierr);
    } else if (dim == 3) {
      CeedInt elems_per_block = 1;
      CeedInt grid = num_elem / elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
                             CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
                                        thread_1d,
                                        num_comp*elems_per_block, shared_mem,
                                        interp_args); CeedChkBackend(ierr);
    }
  } break;
  case CEED_EVAL_GRAD: {
    CeedInt P_1d, Q_1d;
    ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
    CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
    ierr = CeedCudaInitInterpGrad(data->d_interp_1d, data->d_grad_1d, P_1d,
                                  Q_1d, &data->c_B, &data->c_G);
    CeedChkBackend(ierr);
    void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B,
                         &data->c_G, &d_u, &d_v
                        };
    if (dim == 1) {
      CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
                                           CeedIntMax(512 / thread_1d,
                                               1)); // avoid >512 total threads
      CeedInt grid = num_elem / elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block<num_elem) ? 1 : 0 );
      CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1,
                                        elems_per_block, shared_mem, grad_args);
      CeedChkBackend(ierr);
    } else if (dim == 2) {
      const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
      // elems_per_block must be at least 1
      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] /
                                           num_comp : 1, 1);
      CeedInt grid = num_elem / elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
                             CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d,
                                        num_comp*elems_per_block, shared_mem,
                                        grad_args); CeedChkBackend(ierr);
    } else if (dim == 3) {
      CeedInt elems_per_block = 1;
      CeedInt grid = num_elem / elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
                             CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d,
                                        num_comp*elems_per_block, shared_mem,
                                        grad_args); CeedChkBackend(ierr);
    }
  } break;
  case CEED_EVAL_WEIGHT: {
    CeedInt Q_1d;
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
    void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v};
    if (dim == 1) {
      // Fix: clamp to >= 1; for Q_1d > 32 the old `32 / Q_1d` was 0, which
      // divided by zero in the grid-size computation below (the 2D branch
      // already guards against this).
      const CeedInt elems_per_block = CeedIntMax(32 / Q_1d, 1);
      const CeedInt gridsize = num_elem / elems_per_block +
                               ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d,
                                  elems_per_block, 1, weight_args);
      CeedChkBackend(ierr);
    } else if (dim == 2) {
      const CeedInt opt_elems = 32 / (Q_1d * Q_1d);
      const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
      const CeedInt gridsize = num_elem / elems_per_block +
                               ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d,
                                  elems_per_block, weight_args);
      CeedChkBackend(ierr);
    } else if (dim == 3) {
      // One element per block; Q_1d^3 threads per block
      const CeedInt gridsize = num_elem;
      ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, Q_1d,
                                  weight_args);
      CeedChkBackend(ierr);
    }
  } break;
  // LCOV_EXCL_START
  // NOTE(review): these error returns skip the vector restores below,
  // matching the file's existing error-path convention.
  // Evaluate the divergence to/from the quadrature points
  case CEED_EVAL_DIV:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
  // Evaluate the curl to/from the quadrature points
  case CEED_EVAL_CURL:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
  // Take no action, BasisApply should not have been called
  case CEED_EVAL_NONE:
    return CeedError(ceed, CEED_ERROR_BACKEND,
                     "CEED_EVAL_NONE does not make sense in this context");
    // LCOV_EXCL_STOP
  }

  // Restore vectors
  if (eval_mode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
  }
  ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
199 
200 //------------------------------------------------------------------------------
201 // Destroy basis
202 //------------------------------------------------------------------------------
// Release all backend resources held by a shared-memory CUDA basis:
// the JIT-compiled module, the device copies of the 1D basis data, and
// the backend data struct itself.
static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
  int ierr;
  Ceed ceed;
  CeedBasis_Cuda_shared *data;

  // Fetch the owning Ceed and the backend data for this basis
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);

  // Unload the compiled kernel module
  CeedChk_Cu(ceed, cuModuleUnload(data->module));

  // Free device arrays (cudaFree(NULL) is a no-op, e.g. for d_collo_grad_1d)
  ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr);

  // Free the host-side struct
  ierr = CeedFree(&data); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
222 
223 //------------------------------------------------------------------------------
224 // Create tensor basis
225 //------------------------------------------------------------------------------
// Create a tensor-product H1 basis for the cuda-shared backend: copies the 1D
// basis data to the device, optionally builds a collocated gradient, JIT
// compiles the basis kernels, and registers Apply/Destroy.
//
// dim         - topological dimension (1, 2, or 3)
// P_1d        - number of 1D nodes
// Q_1d        - number of 1D quadrature points
// interp_1d   - host 1D interpolation matrix (P_1d * Q_1d entries)
// grad_1d     - host 1D gradient matrix (same size as interp_1d)
// q_ref_1d    - host 1D quadrature point locations (unused here)
// q_weight_1d - host 1D quadrature weights (Q_1d entries)
// basis       - basis object to populate
int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
                                        const CeedScalar *interp_1d,
                                        const CeedScalar *grad_1d,
                                        const CeedScalar *q_ref_1d,
                                        const CeedScalar *q_weight_1d,
                                        CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  CeedBasis_Cuda_shared *data;
  ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);

  // Copy basis data to GPU
  // Quadrature weights: Q_1d scalars
  const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
  ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // Interpolation and gradient matrices: P_1d * Q_1d scalars each
  const CeedInt interp_bytes = q_bytes * P_1d;
  ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // Compute collocated gradient and copy to GPU
  // Only built for 3D and Q_1d >= P_1d; otherwise left NULL
  data->d_collo_grad_1d = NULL;
  if (dim == 3 && Q_1d >= P_1d) {
    CeedScalar *collo_grad_1d;
    // Collocated gradient is Q_1d x Q_1d
    ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr);
    ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr);
    ierr = cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d);
    CeedChk_Cu(ceed, ierr);
    ierr = cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d,
                      cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
    ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr);
  }

  // Compile basis kernels
  CeedInt num_comp;
  ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
  char *basis_kernel_path, *basis_kernel_source;
  ierr = CeedGetJitAbsolutePath(ceed,
                                "ceed/jit-source/cuda/cuda-shared-basis.h",
                                &basis_kernel_path); CeedChkBackend(ierr);
  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
  ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
  CeedChkBackend(ierr);
  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n");
  // JIT compile with 8 compile-time constants specializing the kernels;
  // BASIS_T_1D / BASIS_BUF_LEN are sized by max(P_1d, Q_1d)
  ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8,
                         "BASIS_Q_1D", Q_1d,
                         "BASIS_P_1D", P_1d,
                         "BASIS_T_1D", CeedIntMax(Q_1d, P_1d),
                         "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ?
                             Q_1d : P_1d, dim),
                         "BASIS_DIM", dim,
                         "BASIS_NUM_COMP", num_comp,
                         "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                         "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)
                        ); CeedChkBackend(ierr);
  // Look up kernel handles used by CeedBasisApplyTensor_Cuda_shared
  ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight);
  CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);

  ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);

  // Register backend functions
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
                                CeedBasisApplyTensor_Cuda_shared);
  CeedChkBackend(ierr);
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
                                CeedBasisDestroy_Cuda_shared); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
310 //------------------------------------------------------------------------------
311