xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-basis.c (revision 056ea4bdb268190ce972d0582063449a424e3e57)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 #include "ceed-cuda-ref.h"
14 #include "../cuda/ceed-cuda-compile.h"
15 
16 //------------------------------------------------------------------------------
17 // Basis apply - tensor
18 //------------------------------------------------------------------------------
19 int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem,
20                         CeedTransposeMode t_mode, CeedEvalMode eval_mode,
21                         CeedVector u, CeedVector v) {
22   int ierr;
23   Ceed ceed;
24   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
25   Ceed_Cuda *ceed_Cuda;
26   ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
27   CeedBasis_Cuda *data;
28   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
29   const CeedInt transpose = t_mode == CEED_TRANSPOSE;
30   const int max_block_size = 32;
31 
32   // Read vectors
33   const CeedScalar *d_u;
34   CeedScalar *d_v;
35   if (eval_mode != CEED_EVAL_WEIGHT) {
36     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
37   }
38   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
39 
40   // Clear v for transpose operation
41   if (t_mode == CEED_TRANSPOSE) {
42     CeedSize length;
43     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
44     ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar));
45     CeedChk_Cu(ceed, ierr);
46   }
47   CeedInt Q_1d, dim;
48   ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
49   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
50 
51   // Basis action
52   switch (eval_mode) {
53   case CEED_EVAL_INTERP: {
54     void *interp_args[] = {(void *) &num_elem, (void *) &transpose,
55                            &data->d_interp_1d, &d_u, &d_v
56                           };
57     CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
58 
59     ierr = CeedRunKernelCuda(ceed, data->Interp, num_elem, block_size, interp_args);
60     CeedChkBackend(ierr);
61   } break;
62   case CEED_EVAL_GRAD: {
63     void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d,
64                          &data->d_grad_1d, &d_u, &d_v
65                         };
66     CeedInt block_size = max_block_size;
67 
68     ierr = CeedRunKernelCuda(ceed, data->Grad, num_elem, block_size, grad_args);
69     CeedChkBackend(ierr);
70   } break;
71   case CEED_EVAL_WEIGHT: {
72     void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v};
73     const int grid_size = num_elem;
74     ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid_size,
75                                 Q_1d, dim >= 2 ? Q_1d : 1, 1,
76                                 weight_args); CeedChkBackend(ierr);
77   } break;
78   // LCOV_EXCL_START
79   // Evaluate the divergence to/from the quadrature points
80   case CEED_EVAL_DIV:
81     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
82   // Evaluate the curl to/from the quadrature points
83   case CEED_EVAL_CURL:
84     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
85   // Take no action, BasisApply should not have been called
86   case CEED_EVAL_NONE:
87     return CeedError(ceed, CEED_ERROR_BACKEND,
88                      "CEED_EVAL_NONE does not make sense in this context");
89     // LCOV_EXCL_STOP
90   }
91 
92   // Restore vectors
93   if (eval_mode != CEED_EVAL_WEIGHT) {
94     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
95   }
96   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
97   return CEED_ERROR_SUCCESS;
98 }
99 
100 //------------------------------------------------------------------------------
101 // Basis apply - non-tensor
102 //------------------------------------------------------------------------------
103 int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem,
104                                  CeedTransposeMode t_mode, CeedEvalMode eval_mode,
105                                  CeedVector u, CeedVector v) {
106   int ierr;
107   Ceed ceed;
108   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
109   Ceed_Cuda *ceed_Cuda;
110   ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
111   CeedBasisNonTensor_Cuda *data;
112   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
113   CeedInt num_nodes, num_qpts;
114   ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChkBackend(ierr);
115   ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChkBackend(ierr);
116   const CeedInt transpose = t_mode == CEED_TRANSPOSE;
117   int elems_per_block = 1;
118   int grid = num_elem / elems_per_block +
119              ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
120 
121   // Read vectors
122   const CeedScalar *d_u;
123   CeedScalar *d_v;
124   if (eval_mode != CEED_EVAL_WEIGHT) {
125     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
126   }
127   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
128 
129   // Clear v for transpose operation
130   if (t_mode == CEED_TRANSPOSE) {
131     CeedSize length;
132     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
133     ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar));
134     CeedChk_Cu(ceed, ierr);
135   }
136 
137   // Apply basis operation
138   switch (eval_mode) {
139   case CEED_EVAL_INTERP: {
140     void *interp_args[] = {(void *) &num_elem, (void *) &transpose,
141                            &data->d_interp, &d_u, &d_v
142                           };
143     if (transpose) {
144       ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_nodes, 1,
145                                   elems_per_block, interp_args); CeedChkBackend(ierr);
146     } else {
147       ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_qpts, 1,
148                                   elems_per_block, interp_args); CeedChkBackend(ierr);
149     }
150   } break;
151   case CEED_EVAL_GRAD: {
152     void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_grad,
153                          &d_u, &d_v
154                         };
155     if (transpose) {
156       ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_nodes, 1,
157                                   elems_per_block, grad_args); CeedChkBackend(ierr);
158     } else {
159       ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_qpts, 1,
160                                   elems_per_block, grad_args); CeedChkBackend(ierr);
161     }
162   } break;
163   case CEED_EVAL_WEIGHT: {
164     void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight, &d_v};
165     ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid, num_qpts, 1,
166                                 elems_per_block, weight_args); CeedChkBackend(ierr);
167   } break;
168   // LCOV_EXCL_START
169   // Evaluate the divergence to/from the quadrature points
170   case CEED_EVAL_DIV:
171     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
172   // Evaluate the curl to/from the quadrature points
173   case CEED_EVAL_CURL:
174     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
175   // Take no action, BasisApply should not have been called
176   case CEED_EVAL_NONE:
177     return CeedError(ceed, CEED_ERROR_BACKEND,
178                      "CEED_EVAL_NONE does not make sense in this context");
179     // LCOV_EXCL_STOP
180   }
181 
182   // Restore vectors
183   if (eval_mode != CEED_EVAL_WEIGHT) {
184     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
185   }
186   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
187   return CEED_ERROR_SUCCESS;
188 }
189 
190 //------------------------------------------------------------------------------
191 // Destroy tensor basis
192 //------------------------------------------------------------------------------
193 static int CeedBasisDestroy_Cuda(CeedBasis basis) {
194   int ierr;
195   Ceed ceed;
196   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
197 
198   CeedBasis_Cuda *data;
199   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
200 
201   CeedChk_Cu(ceed, cuModuleUnload(data->module));
202 
203   ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr);
204   ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr);
205   ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr);
206   ierr = CeedFree(&data); CeedChkBackend(ierr);
207 
208   return CEED_ERROR_SUCCESS;
209 }
210 
211 //------------------------------------------------------------------------------
212 // Destroy non-tensor basis
213 //------------------------------------------------------------------------------
214 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
215   int ierr;
216   Ceed ceed;
217   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
218 
219   CeedBasisNonTensor_Cuda *data;
220   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
221 
222   CeedChk_Cu(ceed, cuModuleUnload(data->module));
223 
224   ierr = cudaFree(data->d_q_weight); CeedChk_Cu(ceed, ierr);
225   ierr = cudaFree(data->d_interp); CeedChk_Cu(ceed, ierr);
226   ierr = cudaFree(data->d_grad); CeedChk_Cu(ceed, ierr);
227   ierr = CeedFree(&data); CeedChkBackend(ierr);
228 
229   return CEED_ERROR_SUCCESS;
230 }
231 
232 //------------------------------------------------------------------------------
233 // Create tensor
234 //------------------------------------------------------------------------------
235 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
236                                  const CeedScalar *interp_1d,
237                                  const CeedScalar *grad_1d,
238                                  const CeedScalar *q_ref_1d,
239                                  const CeedScalar *q_weight_1d,
240                                  CeedBasis basis) {
241   int ierr;
242   Ceed ceed;
243   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
244   CeedBasis_Cuda *data;
245   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
246 
247   // Copy data to GPU
248   const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
249   ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes);
250   CeedChk_Cu(ceed, ierr);
251   ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes,
252                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
253 
254   const CeedInt interp_bytes = q_bytes * P_1d;
255   ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes);
256   CeedChk_Cu(ceed, ierr);
257   ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes,
258                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
259 
260   ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes);
261   CeedChk_Cu(ceed, ierr);
262   ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes,
263                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
264 
265   // Complie basis kernels
266   CeedInt num_comp;
267   ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
268   char *basis_kernel_path, *basis_kernel_source;
269   ierr = CeedGetJitAbsolutePath(ceed,
270                                 "ceed/jit-source/cuda/cuda-ref-basis-tensor.h",
271                                 &basis_kernel_path); CeedChkBackend(ierr);
272   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
273   ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
274   CeedChkBackend(ierr);
275   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n");
276   ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 7,
277                          "BASIS_Q_1D", Q_1d,
278                          "BASIS_P_1D", P_1d,
279                          "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ?
280                              Q_1d : P_1d, dim),
281                          "BASIS_DIM", dim,
282                          "BASIS_NUM_COMP", num_comp,
283                          "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
284                          "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)
285                         ); CeedChkBackend(ierr);
286   ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp);
287   CeedChkBackend(ierr);
288   ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad);
289   CeedChkBackend(ierr);
290   ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight);
291   CeedChkBackend(ierr);
292   ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
293   ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);
294 
295   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
296 
297   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
298                                 CeedBasisApply_Cuda); CeedChkBackend(ierr);
299   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
300                                 CeedBasisDestroy_Cuda); CeedChkBackend(ierr);
301   return CEED_ERROR_SUCCESS;
302 }
303 
304 //------------------------------------------------------------------------------
305 // Create non-tensor
306 //------------------------------------------------------------------------------
307 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim,
308                            CeedInt num_nodes,
309                            CeedInt num_qpts, const CeedScalar *interp,
310                            const CeedScalar *grad, const CeedScalar *qref,
311                            const CeedScalar *q_weight, CeedBasis basis) {
312   int ierr;
313   Ceed ceed;
314   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
315   CeedBasisNonTensor_Cuda *data;
316   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
317 
318   // Copy basis data to GPU
319   const CeedInt q_bytes = num_qpts * sizeof(CeedScalar);
320   ierr = cudaMalloc((void **)&data->d_q_weight, q_bytes); CeedChk_Cu(ceed, ierr);
321   ierr = cudaMemcpy(data->d_q_weight, q_weight, q_bytes,
322                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
323 
324   const CeedInt interp_bytes = q_bytes * num_nodes;
325   ierr = cudaMalloc((void **)&data->d_interp, interp_bytes);
326   CeedChk_Cu(ceed, ierr);
327   ierr = cudaMemcpy(data->d_interp, interp, interp_bytes,
328                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
329 
330   const CeedInt grad_bytes = q_bytes * num_nodes * dim;
331   ierr = cudaMalloc((void **)&data->d_grad, grad_bytes); CeedChk_Cu(ceed, ierr);
332   ierr = cudaMemcpy(data->d_grad, grad, grad_bytes,
333                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
334 
335   // Compile basis kernels
336   CeedInt num_comp;
337   ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
338   char *basis_kernel_path, *basis_kernel_source;
339   ierr = CeedGetJitAbsolutePath(ceed,
340                                 "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h",
341                                 &basis_kernel_path); CeedChkBackend(ierr);
342   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
343   ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
344   CeedChkBackend(ierr);
345   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n");
346   ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 4,
347                          "BASIS_Q", num_qpts,
348                          "BASIS_P", num_nodes,
349                          "BASIS_DIM", dim,
350                          "BASIS_NUM_COMP", num_comp
351                         ); CeedChk_Cu(ceed, ierr);
352   ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp);
353   CeedChk_Cu(ceed, ierr);
354   ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad);
355   CeedChk_Cu(ceed, ierr);
356   ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight);
357   CeedChk_Cu(ceed, ierr);
358   ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
359   ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);
360 
361   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
362 
363   // Register backend functions
364   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
365                                 CeedBasisApplyNonTensor_Cuda); CeedChkBackend(ierr);
366   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
367                                 CeedBasisDestroyNonTensor_Cuda); CeedChkBackend(ierr);
368   return CEED_ERROR_SUCCESS;
369 }
370 //------------------------------------------------------------------------------
371