xref: /libCEED/backends/hip-ref/ceed-hip-ref-basis.c (revision cdf95791513f7c35170bef3ba2e19f272fe04533)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed/ceed.h>
18 #include <ceed/backend.h>
19 #include <ceed/jit-tools.h>
20 #include <hip/hip_runtime.h>
21 #include "ceed-hip-ref.h"
22 #include "../hip/ceed-hip-compile.h"
23 
24 //------------------------------------------------------------------------------
25 // Basis apply - tensor
26 //------------------------------------------------------------------------------
27 int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem,
28                        CeedTransposeMode t_mode,
29                        CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
30   int ierr;
31   Ceed ceed;
32   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
33   Ceed_Hip *ceed_Hip;
34   ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr);
35   CeedBasis_Hip *data;
36   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
37   const CeedInt transpose = t_mode == CEED_TRANSPOSE;
38   const int max_block_size = 64;
39 
40   // Read vectors
41   const CeedScalar *d_u;
42   CeedScalar *d_v;
43   if (eval_mode != CEED_EVAL_WEIGHT) {
44     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
45   }
46   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
47 
48   // Clear v for transpose operation
49   if (t_mode == CEED_TRANSPOSE) {
50     CeedInt length;
51     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
52     ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar));
53     CeedChk_Hip(ceed, ierr);
54   }
55 
56   // Basis action
57   switch (eval_mode) {
58   case CEED_EVAL_INTERP: {
59     void *interp_args[] = {(void *) &num_elem, (void *) &transpose,
60                            &data->d_interp_1d, &d_u, &d_v
61                           };
62     CeedInt Q_1d, dim;
63     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
64     ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
65     CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
66 
67     ierr = CeedRunKernelHip(ceed, data->Interp, num_elem, block_size, interp_args);
68     CeedChkBackend(ierr);
69   } break;
70   case CEED_EVAL_GRAD: {
71     void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d,
72                          &data->d_grad_1d, &d_u, &d_v
73                         };
74     CeedInt block_size = max_block_size;
75 
76     ierr = CeedRunKernelHip(ceed, data->Grad, num_elem, block_size, grad_args);
77     CeedChkBackend(ierr);
78   } break;
79   case CEED_EVAL_WEIGHT: {
80     void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v};
81     const int block_size = 64;
82     int grid_size = num_elem / block_size;
83     if (block_size * grid_size < num_elem)
84       grid_size += 1;
85 
86     ierr = CeedRunKernelHip(ceed, data->Weight, grid_size, block_size,
87                             weight_args); CeedChkBackend(ierr);
88   } break;
89   // LCOV_EXCL_START
90   // Evaluate the divergence to/from the quadrature points
91   case CEED_EVAL_DIV:
92     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
93   // Evaluate the curl to/from the quadrature points
94   case CEED_EVAL_CURL:
95     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
96   // Take no action, BasisApply should not have been called
97   case CEED_EVAL_NONE:
98     return CeedError(ceed, CEED_ERROR_BACKEND,
99                      "CEED_EVAL_NONE does not make sense in this context");
100     // LCOV_EXCL_STOP
101   }
102 
103   // Restore vectors
104   if (eval_mode != CEED_EVAL_WEIGHT) {
105     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
106   }
107   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
108   return CEED_ERROR_SUCCESS;
109 }
110 
111 //------------------------------------------------------------------------------
112 // Basis apply - non-tensor
113 //------------------------------------------------------------------------------
114 int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem,
115                                 CeedTransposeMode t_mode, CeedEvalMode eval_mode,
116                                 CeedVector u, CeedVector v) {
117   int ierr;
118   Ceed ceed;
119   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
120   Ceed_Hip *ceed_Hip;
121   ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr);
122   CeedBasisNonTensor_Hip *data;
123   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
124   CeedInt num_nodes, num_qpts;
125   ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChkBackend(ierr);
126   ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChkBackend(ierr);
127   const CeedInt transpose = t_mode == CEED_TRANSPOSE;
128   int elemsPerBlock = 1;
129   int grid = num_elem/elemsPerBlock+((
130                                        num_elem/elemsPerBlock*elemsPerBlock<num_elem)?1:0);
131 
132   // Read vectors
133   const CeedScalar *d_u;
134   CeedScalar *d_v;
135   if (eval_mode != CEED_EVAL_WEIGHT) {
136     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
137   }
138   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
139 
140   // Clear v for transpose operation
141   if (t_mode == CEED_TRANSPOSE) {
142     CeedInt length;
143     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
144     ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar));
145     CeedChk_Hip(ceed, ierr);
146   }
147 
148   // Apply basis operation
149   switch (eval_mode) {
150   case CEED_EVAL_INTERP: {
151     void *interp_args[] = {(void *) &num_elem, (void *) &transpose,
152                            &data->d_interp, &d_u, &d_v
153                           };
154     if (transpose) {
155       ierr = CeedRunKernelDimHip(ceed, data->Interp, grid, num_nodes, 1,
156                                  elemsPerBlock, interp_args); CeedChkBackend(ierr);
157     } else {
158       ierr = CeedRunKernelDimHip(ceed, data->Interp, grid, num_qpts, 1,
159                                  elemsPerBlock, interp_args); CeedChkBackend(ierr);
160     }
161   } break;
162   case CEED_EVAL_GRAD: {
163     void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_grad,
164                          &d_u, &d_v
165                         };
166     if (transpose) {
167       ierr = CeedRunKernelDimHip(ceed, data->Grad, grid, num_nodes, 1,
168                                  elemsPerBlock, grad_args); CeedChkBackend(ierr);
169     } else {
170       ierr = CeedRunKernelDimHip(ceed, data->Grad, grid, num_qpts, 1,
171                                  elemsPerBlock, grad_args); CeedChkBackend(ierr);
172     }
173   } break;
174   case CEED_EVAL_WEIGHT: {
175     void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight, &d_v};
176     ierr = CeedRunKernelDimHip(ceed, data->Weight, grid, num_qpts, 1,
177                                elemsPerBlock, weight_args); CeedChkBackend(ierr);
178   } break;
179   // LCOV_EXCL_START
180   // Evaluate the divergence to/from the quadrature points
181   case CEED_EVAL_DIV:
182     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
183   // Evaluate the curl to/from the quadrature points
184   case CEED_EVAL_CURL:
185     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
186   // Take no action, BasisApply should not have been called
187   case CEED_EVAL_NONE:
188     return CeedError(ceed, CEED_ERROR_BACKEND,
189                      "CEED_EVAL_NONE does not make sense in this context");
190     // LCOV_EXCL_STOP
191   }
192 
193   // Restore vectors
194   if (eval_mode != CEED_EVAL_WEIGHT) {
195     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
196   }
197   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
198   return CEED_ERROR_SUCCESS;
199 }
200 
201 //------------------------------------------------------------------------------
202 // Destroy tensor basis
203 //------------------------------------------------------------------------------
204 static int CeedBasisDestroy_Hip(CeedBasis basis) {
205   int ierr;
206   Ceed ceed;
207   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
208 
209   CeedBasis_Hip *data;
210   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
211 
212   CeedChk_Hip(ceed, hipModuleUnload(data->module));
213 
214   ierr = hipFree(data->d_q_weight_1d); CeedChk_Hip(ceed, ierr);
215   ierr = hipFree(data->d_interp_1d); CeedChk_Hip(ceed, ierr);
216   ierr = hipFree(data->d_grad_1d); CeedChk_Hip(ceed, ierr);
217   ierr = CeedFree(&data); CeedChkBackend(ierr);
218 
219   return CEED_ERROR_SUCCESS;
220 }
221 
222 //------------------------------------------------------------------------------
223 // Destroy non-tensor basis
224 //------------------------------------------------------------------------------
225 static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) {
226   int ierr;
227   Ceed ceed;
228   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
229 
230   CeedBasisNonTensor_Hip *data;
231   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
232 
233   CeedChk_Hip(ceed, hipModuleUnload(data->module));
234 
235   ierr = hipFree(data->d_q_weight); CeedChk_Hip(ceed, ierr);
236   ierr = hipFree(data->d_interp); CeedChk_Hip(ceed, ierr);
237   ierr = hipFree(data->d_grad); CeedChk_Hip(ceed, ierr);
238   ierr = CeedFree(&data); CeedChkBackend(ierr);
239 
240   return CEED_ERROR_SUCCESS;
241 }
242 
243 //------------------------------------------------------------------------------
244 // Create tensor
245 //------------------------------------------------------------------------------
246 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
247                                 const CeedScalar *interp_1d,
248                                 const CeedScalar *grad_1d,
249                                 const CeedScalar *qref1d,
250                                 const CeedScalar *q_weight_1d,
251                                 CeedBasis basis) {
252   int ierr;
253   Ceed ceed;
254   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
255   CeedBasis_Hip *data;
256   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
257 
258   // Copy data to GPU
259   const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
260   ierr = hipMalloc((void **)&data->d_q_weight_1d, q_bytes);
261   CeedChk_Hip(ceed, ierr);
262   ierr = hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes,
263                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
264 
265   const CeedInt interp_bytes = q_bytes * P_1d;
266   ierr = hipMalloc((void **)&data->d_interp_1d, interp_bytes);
267   CeedChk_Hip(ceed, ierr);
268   ierr = hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes,
269                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
270 
271   ierr = hipMalloc((void **)&data->d_grad_1d, interp_bytes);
272   CeedChk_Hip(ceed, ierr);
273   ierr = hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes,
274                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
275 
276   // Complie basis kernels
277   CeedInt ncomp;
278   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
279   char *basis_kernel_path, *basis_kernel_source;
280   ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-basis-tensor.h",
281                              &basis_kernel_path); CeedChkBackend(ierr);
282   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
283   ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
284   CeedChkBackend(ierr);
285   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n");
286   ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 7,
287                         "BASIS_Q_1D", Q_1d,
288                         "BASIS_P_1D", P_1d,
289                         "BASIS_BUF_LEN", ncomp * CeedIntPow(Q_1d > P_1d ?
290                             Q_1d : P_1d, dim),
291                         "BASIS_DIM", dim,
292                         "BASIS_NUM_COMP", ncomp,
293                         "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
294                         "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)
295                        ); CeedChkBackend(ierr);
296   ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp);
297   CeedChkBackend(ierr);
298   ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad);
299   CeedChkBackend(ierr);
300   ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight);
301   CeedChkBackend(ierr);
302   ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
303   ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);
304 
305   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
306 
307   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
308                                 CeedBasisApply_Hip); CeedChkBackend(ierr);
309   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
310                                 CeedBasisDestroy_Hip); CeedChkBackend(ierr);
311   return CEED_ERROR_SUCCESS;
312 }
313 
314 //------------------------------------------------------------------------------
315 // Create non-tensor
316 //------------------------------------------------------------------------------
317 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
318                           CeedInt num_qpts, const CeedScalar *interp,
319                           const CeedScalar *grad, const CeedScalar *qref,
320                           const CeedScalar *q_weight, CeedBasis basis) {
321   int ierr;
322   Ceed ceed;
323   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
324   CeedBasisNonTensor_Hip *data;
325   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
326 
327   // Copy basis data to GPU
328   const CeedInt q_bytes = num_qpts * sizeof(CeedScalar);
329   ierr = hipMalloc((void **)&data->d_q_weight, q_bytes); CeedChk_Hip(ceed, ierr);
330   ierr = hipMemcpy(data->d_q_weight, q_weight, q_bytes,
331                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
332 
333   const CeedInt interp_bytes = q_bytes * num_nodes;
334   ierr = hipMalloc((void **)&data->d_interp, interp_bytes);
335   CeedChk_Hip(ceed, ierr);
336   ierr = hipMemcpy(data->d_interp, interp, interp_bytes,
337                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
338 
339   const CeedInt grad_bytes = q_bytes * num_nodes * dim;
340   ierr = hipMalloc((void **)&data->d_grad, grad_bytes); CeedChk_Hip(ceed, ierr);
341   ierr = hipMemcpy(data->d_grad, grad, grad_bytes,
342                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
343 
344   // Compile basis kernels
345   CeedInt ncomp;
346   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
347   char *basis_kernel_path, *basis_kernel_source;
348   ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-basis-nontensor.h",
349                              &basis_kernel_path); CeedChkBackend(ierr);
350   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
351   ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
352   CeedChkBackend(ierr);
353   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n");
354   ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 4,
355                         "BASIS_Q", num_qpts,
356                         "BASIS_P", num_nodes,
357                         "BASIS_DIM", dim,
358                         "BASIS_NUM_COMP", ncomp
359                        ); CeedChk_Hip(ceed, ierr);
360   ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp);
361   CeedChk_Hip(ceed, ierr);
362   ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad);
363   CeedChk_Hip(ceed, ierr);
364   ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight);
365   CeedChk_Hip(ceed, ierr);
366   ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
367   ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);
368 
369   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
370 
371   // Register backend functions
372   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
373                                 CeedBasisApplyNonTensor_Hip); CeedChkBackend(ierr);
374   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
375                                 CeedBasisDestroyNonTensor_Hip); CeedChkBackend(ierr);
376   return CEED_ERROR_SUCCESS;
377 }
378 //------------------------------------------------------------------------------
379