xref: /libCEED/backends/cuda-shared/ceed-cuda-shared-basis.c (revision 88b783a108a0e7f73f6a4b1c66ee7b6d1a268995)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed/ceed.h>
18 #include <ceed/backend.h>
19 #include <ceed/jit-tools.h>
20 #include <cuda.h>
21 #include <cuda_runtime.h>
22 #include <stddef.h>
23 #include "ceed-cuda-shared.h"
24 #include "../cuda/ceed-cuda-compile.h"
25 
26 
27 //------------------------------------------------------------------------------
// Device initialization
29 //------------------------------------------------------------------------------
// Forward declarations (defined elsewhere in this backend): publish the
// device copies of the 1D basis matrices (B = interp, G = grad) for kernel
// use; the pointers written to c_B / c_B_ptr / c_G_ptr are the ones passed
// as kernel arguments in CeedBasisApplyTensor_Cuda_shared below.
int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d,
                       CeedScalar **c_B);
int CeedCudaInitInterpGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d,
                           CeedInt Q_1d, CeedScalar **c_B_ptr,
                           CeedScalar **c_G_ptr);
35 
36 //------------------------------------------------------------------------------
37 // Apply basis
38 //------------------------------------------------------------------------------
39 int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem,
40                                      CeedTransposeMode t_mode,
41                                      CeedEvalMode eval_mode, CeedVector u,
42                                      CeedVector v) {
43   int ierr;
44   Ceed ceed;
45   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
46   Ceed_Cuda *ceed_Cuda;
47   CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
48   CeedBasis_Cuda_shared *data;
49   CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
50   const CeedInt transpose = t_mode == CEED_TRANSPOSE;
51   CeedInt dim, num_comp;
52   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
53   ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
54 
55   // Read vectors
56   const CeedScalar *d_u;
57   CeedScalar *d_v;
58   if (eval_mode != CEED_EVAL_WEIGHT) {
59     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
60   }
61   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
62 
63   // Clear v for transpose mode
64   if (t_mode == CEED_TRANSPOSE) {
65     CeedInt length;
66     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
67     ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChkBackend(ierr);
68   }
69 
70   // Apply basis operation
71   switch (eval_mode) {
72   case CEED_EVAL_INTERP: {
73     CeedInt P_1d, Q_1d;
74     ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
75     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
76     CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
77     ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B);
78     CeedChkBackend(ierr);
79     void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B,
80                            &d_u, &d_v
81                           };
82     if (dim == 1) {
83       CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
84                                            CeedIntMax(512 / thread_1d,
85                                                1)); // avoid >512 total threads
86       CeedInt grid = num_elem/elems_per_block +
87                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
88       CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
89       ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1,
90                                         elems_per_block, shared_mem,
91                                         interp_args); CeedChkBackend(ierr);
92     } else if (dim == 2) {
93       const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
94       // elems_per_block must be at least 1
95       CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] /
96                                            num_comp : 1, 1);
97       CeedInt grid = num_elem / elems_per_block +
98                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
99       CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
100                              CeedScalar);
101       ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
102                                         thread_1d,
103                                         num_comp*elems_per_block, shared_mem,
104                                         interp_args); CeedChkBackend(ierr);
105     } else if (dim == 3) {
106       CeedInt elems_per_block = 1;
107       CeedInt grid = num_elem / elems_per_block +
108                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
109       CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
110                              CeedScalar);
111       ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
112                                         thread_1d,
113                                         num_comp*elems_per_block, shared_mem,
114                                         interp_args); CeedChkBackend(ierr);
115     }
116   } break;
117   case CEED_EVAL_GRAD: {
118     CeedInt P_1d, Q_1d;
119     ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
120     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
121     CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
122     ierr = CeedCudaInitInterpGrad(data->d_interp_1d, data->d_grad_1d, P_1d,
123                                   Q_1d, &data->c_B, &data->c_G);
124     CeedChkBackend(ierr);
125     void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B,
126                          &data->c_G, &d_u, &d_v
127                         };
128     if (dim == 1) {
129       CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
130                                            CeedIntMax(512 / thread_1d,
131                                                1)); // avoid >512 total threads
132       CeedInt grid = num_elem / elems_per_block +
133                      ((num_elem / elems_per_block*elems_per_block<num_elem) ? 1 : 0 );
134       CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
135       ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1,
136                                         elems_per_block, shared_mem, grad_args);
137       CeedChkBackend(ierr);
138     } else if (dim == 2) {
139       const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
140       // elems_per_block must be at least 1
141       CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] /
142                                            num_comp : 1, 1);
143       CeedInt grid = num_elem / elems_per_block +
144                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
145       CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
146                              CeedScalar);
147       ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d,
148                                         num_comp*elems_per_block, shared_mem,
149                                         grad_args); CeedChkBackend(ierr);
150     } else if (dim == 3) {
151       CeedInt elems_per_block = 1;
152       CeedInt grid = num_elem / elems_per_block +
153                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
154       CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
155                              CeedScalar);
156       ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d,
157                                         num_comp*elems_per_block, shared_mem,
158                                         grad_args); CeedChkBackend(ierr);
159     }
160   } break;
161   case CEED_EVAL_WEIGHT: {
162     CeedInt Q_1d;
163     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
164     void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v};
165     if (dim == 1) {
166       const CeedInt elems_per_block = 32 / Q_1d;
167       const CeedInt gridsize = num_elem / elems_per_block +
168                                ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
169       ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d,
170                                   elems_per_block, 1, weight_args);
171       CeedChkBackend(ierr);
172     } else if (dim == 2) {
173       const CeedInt opt_elems = 32 / (Q_1d * Q_1d);
174       const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
175       const CeedInt gridsize = num_elem / elems_per_block +
176                                ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
177       ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d,
178                                   elems_per_block, weight_args);
179       CeedChkBackend(ierr);
180     } else if (dim == 3) {
181       const CeedInt gridsize = num_elem;
182       ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, Q_1d,
183                                   weight_args);
184       CeedChkBackend(ierr);
185     }
186   } break;
187   // LCOV_EXCL_START
188   // Evaluate the divergence to/from the quadrature points
189   case CEED_EVAL_DIV:
190     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
191   // Evaluate the curl to/from the quadrature points
192   case CEED_EVAL_CURL:
193     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
194   // Take no action, BasisApply should not have been called
195   case CEED_EVAL_NONE:
196     return CeedError(ceed, CEED_ERROR_BACKEND,
197                      "CEED_EVAL_NONE does not make sense in this context");
198     // LCOV_EXCL_STOP
199   }
200 
201   // Restore vectors
202   if (eval_mode != CEED_EVAL_WEIGHT) {
203     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
204   }
205   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
206   return CEED_ERROR_SUCCESS;
207 }
208 
209 //------------------------------------------------------------------------------
210 // Destroy basis
211 //------------------------------------------------------------------------------
212 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
213   int ierr;
214   Ceed ceed;
215   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
216 
217   CeedBasis_Cuda_shared *data;
218   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
219 
220   CeedChk_Cu(ceed, cuModuleUnload(data->module));
221 
222   ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr);
223   ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr);
224   ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr);
225   ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr);
226 
227   ierr = CeedFree(&data); CeedChkBackend(ierr);
228 
229   return CEED_ERROR_SUCCESS;
230 }
231 
232 //------------------------------------------------------------------------------
233 // Create tensor basis
234 //------------------------------------------------------------------------------
// Build the CUDA-shared backend data for a tensor-product H1 basis:
// copy the 1D matrices to the device, optionally build a collocated
// gradient (3D only), JIT-compile the basis kernels with the sizes baked
// in as compile-time constants, and register Apply/Destroy.
// Note: q_ref_1d is not used by this backend and is not copied to device.
int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
                                        const CeedScalar *interp_1d,
                                        const CeedScalar *grad_1d,
                                        const CeedScalar *q_ref_1d,
                                        const CeedScalar *q_weight_1d,
                                        CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  CeedBasis_Cuda_shared *data;
  // Calloc zero-initializes, so unset device pointers start out NULL
  ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);

  // Copy basis data to GPU
  // 1D quadrature weights: Q_1d scalars
  const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
  ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // Interp and grad matrices are both Q_1d x P_1d
  const CeedInt interp_bytes = q_bytes * P_1d;
  ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // Compute collocated gradient and copy to GPU
  // Only built for 3D when Q_1d >= P_1d; otherwise left NULL.
  // The collocated gradient is a Q_1d x Q_1d matrix.
  data->d_collo_grad_1d = NULL;
  if (dim == 3 && Q_1d >= P_1d) {
    CeedScalar *collo_grad_1d;
    ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr);
    ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr);
    ierr = cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d);
    CeedChk_Cu(ceed, ierr);
    ierr = cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d,
                      cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
    // Host staging buffer is no longer needed once copied
    ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr);
  }

  // Compile basis kernels
  // The kernel source is JIT-compiled with 8 compile-time constants
  // (the name/value pairs below) so sizes are baked in per basis.
  CeedInt num_comp;
  ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
  char *basis_kernel_path, *basis_kernel_source;
  ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-shared-basis.h",
                             &basis_kernel_path); CeedChkBackend(ierr);
  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
  ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
  CeedChkBackend(ierr);
  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n");
  ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8,
                         "BASIS_Q_1D", Q_1d,
                         "BASIS_P_1D", P_1d,
                         "BASIS_T_1D", CeedIntMax(Q_1d, P_1d),
                         "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ?
                             Q_1d : P_1d, dim),
                         "BASIS_DIM", dim,
                         "BASIS_NUM_COMP", num_comp,
                         "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                         "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)
                        ); CeedChkBackend(ierr);
  // Look up the three kernel entry points used by Apply
  ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight);
  CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);

  ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);

  // Register backend functions
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
                                CeedBasisApplyTensor_Cuda_shared);
  CeedChkBackend(ierr);
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
                                CeedBasisDestroy_Cuda_shared); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
318 //------------------------------------------------------------------------------
319