xref: /libCEED/backends/cuda-shared/ceed-cuda-shared-basis.c (revision 0be03a92683d319639505fd4b3dce80b3bae318f)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 #include <stddef.h>
14 #include "ceed-cuda-shared.h"
15 #include "../cuda/ceed-cuda-compile.h"
16 
17 
18 //------------------------------------------------------------------------------
// Device initialization
20 //------------------------------------------------------------------------------
// Forward declarations for helpers defined in the CUDA (.cu) counterpart of
// this backend. Each uploads the given 1D basis matrices to device constant
// memory (presumably — TODO confirm against the .cu implementation) and
// returns the resulting device pointers through the c_B/c_G output arguments.
int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d,
                       CeedScalar **c_B);
int CeedCudaInitGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d,
                     CeedInt Q_1d, CeedScalar **c_B_ptr,
                     CeedScalar **c_G_ptr);
int CeedCudaInitCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d,
                               CeedInt Q_1d, CeedScalar **c_B_ptr,
                               CeedScalar **c_G_ptr);
29 
30 //------------------------------------------------------------------------------
31 // Apply basis
32 //------------------------------------------------------------------------------
33 int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem,
34                                      CeedTransposeMode t_mode,
35                                      CeedEvalMode eval_mode, CeedVector u,
36                                      CeedVector v) {
37   int ierr;
38   Ceed ceed;
39   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
40   Ceed_Cuda *ceed_Cuda;
41   CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
42   CeedBasis_Cuda_shared *data;
43   CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
44   CeedInt dim, num_comp;
45   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
46   ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
47 
48   // Read vectors
49   const CeedScalar *d_u;
50   CeedScalar *d_v;
51   if (eval_mode != CEED_EVAL_WEIGHT) {
52     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
53   }
54   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
55 
56   // Apply basis operation
57   switch (eval_mode) {
58   case CEED_EVAL_INTERP: {
59     CeedInt P_1d, Q_1d;
60     ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
61     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
62     CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
63     ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B);
64     CeedChkBackend(ierr);
65     void *interp_args[] = {(void *) &num_elem, &data->c_B,
66                            &d_u, &d_v
67                           };
68     if (dim == 1) {
69       CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
70                                            CeedIntMax(512 / thread_1d,
71                                                1)); // avoid >512 total threads
72       CeedInt grid = num_elem/elems_per_block +
73                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
74       CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
75       if (t_mode == CEED_TRANSPOSE) {
76         ierr = CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d,
77                                           1,
78                                           elems_per_block, shared_mem,
79                                           interp_args); CeedChkBackend(ierr);
80       } else {
81         ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1,
82                                           elems_per_block, shared_mem,
83                                           interp_args); CeedChkBackend(ierr);
84       }
85     } else if (dim == 2) {
86       const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
87       // elems_per_block must be at least 1
88       CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] /
89                                            num_comp : 1, 1);
90       CeedInt grid = num_elem / elems_per_block +
91                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
92       CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
93                              CeedScalar);
94       if (t_mode == CEED_TRANSPOSE) {
95         ierr = CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d,
96                                           thread_1d,
97                                           elems_per_block, shared_mem,
98                                           interp_args); CeedChkBackend(ierr);
99       } else {
100         ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
101                                           thread_1d,
102                                           elems_per_block, shared_mem,
103                                           interp_args); CeedChkBackend(ierr);
104       }
105     } else if (dim == 3) {
106       CeedInt elems_per_block = 1;
107       CeedInt grid = num_elem / elems_per_block +
108                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
109       CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
110                              CeedScalar);
111       if (t_mode == CEED_TRANSPOSE) {
112         ierr = CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d,
113                                           thread_1d,
114                                           elems_per_block, shared_mem,
115                                           interp_args); CeedChkBackend(ierr);
116       } else {
117         ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
118                                           thread_1d,
119                                           elems_per_block, shared_mem,
120                                           interp_args); CeedChkBackend(ierr);
121       }
122     }
123   } break;
124   case CEED_EVAL_GRAD: {
125     CeedInt P_1d, Q_1d;
126     ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
127     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
128     CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
129     if (data->d_collo_grad_1d) {
130       ierr = CeedCudaInitCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d,
131                                         P_1d,
132                                         Q_1d, &data->c_B, &data->c_G);
133       CeedChkBackend(ierr);
134     } else {
135       ierr = CeedCudaInitGrad(data->d_interp_1d, data->d_grad_1d, P_1d,
136                               Q_1d, &data->c_B, &data->c_G);
137       CeedChkBackend(ierr);
138     }
139     void *grad_args[] = {(void *) &num_elem, &data->c_B,
140                          &data->c_G, &d_u, &d_v
141                         };
142     if (dim == 1) {
143       CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
144                                            CeedIntMax(512 / thread_1d,
145                                                1)); // avoid >512 total threads
146       CeedInt grid = num_elem / elems_per_block +
147                      ((num_elem / elems_per_block*elems_per_block<num_elem) ? 1 : 0 );
148       CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
149       if (t_mode == CEED_TRANSPOSE) {
150         ierr = CeedRunKernelDimSharedCuda(ceed, data->GradTranspose, grid, thread_1d, 1,
151                                           elems_per_block, shared_mem, grad_args);
152         CeedChkBackend(ierr);
153       } else {
154         ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1,
155                                           elems_per_block, shared_mem, grad_args);
156         CeedChkBackend(ierr);
157       }
158     } else if (dim == 2) {
159       const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
160       // elems_per_block must be at least 1
161       CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] /
162                                            num_comp : 1, 1);
163       CeedInt grid = num_elem / elems_per_block +
164                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
165       CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
166                              CeedScalar);
167       if (t_mode == CEED_TRANSPOSE) {
168         ierr = CeedRunKernelDimSharedCuda(ceed, data->GradTranspose, grid, thread_1d,
169                                           thread_1d,
170                                           elems_per_block, shared_mem,
171                                           grad_args); CeedChkBackend(ierr);
172       } else {
173         ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d,
174                                           elems_per_block, shared_mem,
175                                           grad_args); CeedChkBackend(ierr);
176       }
177     } else if (dim == 3) {
178       CeedInt elems_per_block = 1;
179       CeedInt grid = num_elem / elems_per_block +
180                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
181       CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
182                              CeedScalar);
183       if (t_mode == CEED_TRANSPOSE) {
184         ierr = CeedRunKernelDimSharedCuda(ceed, data->GradTranspose, grid, thread_1d,
185                                           thread_1d,
186                                           elems_per_block, shared_mem,
187                                           grad_args); CeedChkBackend(ierr);
188       } else {
189         ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d,
190                                           elems_per_block, shared_mem,
191                                           grad_args); CeedChkBackend(ierr);
192       }
193     }
194   } break;
195   case CEED_EVAL_WEIGHT: {
196     CeedInt Q_1d;
197     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
198     void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v};
199     if (dim == 1) {
200       const CeedInt elems_per_block = 32 / Q_1d;
201       const CeedInt gridsize = num_elem / elems_per_block +
202                                ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
203       ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d,
204                                   elems_per_block, 1, weight_args);
205       CeedChkBackend(ierr);
206     } else if (dim == 2) {
207       const CeedInt opt_elems = 32 / (Q_1d * Q_1d);
208       const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
209       const CeedInt gridsize = num_elem / elems_per_block +
210                                ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
211       ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d,
212                                   elems_per_block, weight_args);
213       CeedChkBackend(ierr);
214     } else if (dim == 3) {
215       const CeedInt opt_elems = 32 / (Q_1d * Q_1d);
216       const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
217       const CeedInt gridsize = num_elem / elems_per_block +
218                                ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
219       ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d,
220                                   elems_per_block, weight_args);
221       CeedChkBackend(ierr);
222     }
223   } break;
224   // LCOV_EXCL_START
225   // Evaluate the divergence to/from the quadrature points
226   case CEED_EVAL_DIV:
227     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
228   // Evaluate the curl to/from the quadrature points
229   case CEED_EVAL_CURL:
230     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
231   // Take no action, BasisApply should not have been called
232   case CEED_EVAL_NONE:
233     return CeedError(ceed, CEED_ERROR_BACKEND,
234                      "CEED_EVAL_NONE does not make sense in this context");
235     // LCOV_EXCL_STOP
236   }
237 
238   // Restore vectors
239   if (eval_mode != CEED_EVAL_WEIGHT) {
240     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
241   }
242   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
243   return CEED_ERROR_SUCCESS;
244 }
245 
246 //------------------------------------------------------------------------------
247 // Destroy basis
248 //------------------------------------------------------------------------------
249 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
250   int ierr;
251   Ceed ceed;
252   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
253 
254   CeedBasis_Cuda_shared *data;
255   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
256 
257   CeedChk_Cu(ceed, cuModuleUnload(data->module));
258 
259   ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr);
260   ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr);
261   ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr);
262   ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr);
263 
264   ierr = CeedFree(&data); CeedChkBackend(ierr);
265 
266   return CEED_ERROR_SUCCESS;
267 }
268 
269 //------------------------------------------------------------------------------
270 // Create tensor basis
271 //------------------------------------------------------------------------------
// Create a tensor-product H1 basis for the cuda-shared backend.
//
// Copies the 1D quadrature weights, interpolation matrix, and gradient matrix
// to the device, optionally builds a collocated gradient matrix (3D only,
// when Q_1d >= P_1d), JiT-compiles the shared-memory basis kernels, and
// registers the Apply/Destroy backend functions on the basis.
//
// Note: q_ref_1d is part of the required backend signature but is not used by
// this backend. Matrix storage layout (row- vs column-major) is determined by
// the kernel source — confirm against cuda-shared-basis-tensor.h if needed.
int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
                                        const CeedScalar *interp_1d,
                                        const CeedScalar *grad_1d,
                                        const CeedScalar *q_ref_1d,
                                        const CeedScalar *q_weight_1d,
                                        CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  CeedBasis_Cuda_shared *data;
  ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);

  // Copy basis data to GPU
  // q_bytes = size of one length-Q_1d array of CeedScalar
  const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
  ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // interp_1d and grad_1d are both P_1d x Q_1d entries
  const CeedInt interp_bytes = q_bytes * P_1d;
  ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // Compute collocated gradient and copy to GPU
  // d_collo_grad_1d stays NULL unless built here; CeedBasisApplyTensor_Cuda_shared
  // uses its non-NULL-ness to pick the collocated gradient path
  data->d_collo_grad_1d = NULL;
  bool has_collocated_grad = dim == 3 && Q_1d >= P_1d;
  if (has_collocated_grad) {
    // Host-side scratch buffer for the Q_1d x Q_1d collocated gradient matrix
    CeedScalar *collo_grad_1d;
    ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr);
    ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr);
    ierr = cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d);
    CeedChk_Cu(ceed, ierr);
    ierr = cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d,
                      cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
    ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr);
  }

  // Compile basis kernels
  CeedInt num_comp;
  ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
  char *basis_kernel_path, *basis_kernel_source;
  ierr = CeedGetJitAbsolutePath(ceed,
                                "ceed/jit-source/cuda/cuda-shared-basis-tensor.h",
                                &basis_kernel_path); CeedChkBackend(ierr);
  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
  ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
  CeedChkBackend(ierr);
  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n");
  // The literal 8 is the number of name/value macro pairs that follow;
  // keep it in sync when adding or removing a definition
  ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8,
                         "BASIS_Q_1D", Q_1d,
                         "BASIS_P_1D", P_1d,
                         "T_1D", CeedIntMax(Q_1d, P_1d),
                         "BASIS_DIM", dim,
                         "BASIS_NUM_COMP", num_comp,
                         "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                         "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim),
                         "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad
                        ); CeedChkBackend(ierr);
  // Look up handles to the five kernels used by Apply
  ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "InterpTranspose",
                           &data->InterpTranspose);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "GradTranspose",
                           &data->GradTranspose);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight);
  CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);

  ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);

  // Register backend functions
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
                                CeedBasisApplyTensor_Cuda_shared);
  CeedChkBackend(ierr);
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
                                CeedBasisDestroy_Cuda_shared); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
362 //------------------------------------------------------------------------------
363