xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator.c (revision ea41f46c9c66b87ba3f0a3dda67c05d50bd07de1)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-source/hip/hip-types.h>
11 #include <stddef.h>
12 #include <hip/hiprtc.h>
13 
14 #include "../hip/ceed-hip-common.h"
15 #include "../hip/ceed-hip-compile.h"
16 #include "ceed-hip-gen-operator-build.h"
17 #include "ceed-hip-gen.h"
18 
19 //------------------------------------------------------------------------------
20 // Destroy operator
21 //------------------------------------------------------------------------------
22 static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
23   Ceed                  ceed;
24   CeedOperator_Hip_gen *impl;
25 
26   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
27   CeedCallBackend(CeedOperatorGetData(op, &impl));
28   if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem));
29   CeedCallBackend(CeedFree(&impl));
30   CeedCallBackend(CeedDestroy(&ceed));
31   return CEED_ERROR_SUCCESS;
32 }
33 
34 //------------------------------------------------------------------------------
35 // Apply and add to output
36 //------------------------------------------------------------------------------
37 static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
38   bool                   is_at_points, is_tensor, is_good_run = true;
39   Ceed                   ceed;
40   CeedInt                num_elem, num_input_fields, num_output_fields;
41   CeedEvalMode           eval_mode;
42   CeedVector             output_vecs[CEED_FIELD_MAX] = {NULL};
43   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
44   CeedQFunction_Hip_gen *qf_data;
45   CeedQFunction          qf;
46   CeedOperatorField     *op_input_fields, *op_output_fields;
47   CeedOperator_Hip_gen  *data;
48 
49   // Creation of the operator
50   {
51     bool is_good_build = false;
52 
53     CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_good_build));
54     if (!is_good_build) {
55       CeedOperator op_fallback;
56 
57       CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to code generation issue");
58       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
59       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
60       return CEED_ERROR_SUCCESS;
61     }
62   }
63 
64   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
65   CeedCallBackend(CeedOperatorGetData(op, &data));
66   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
67   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
68   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
69   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
70   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
71 
72   // Input vectors
73   for (CeedInt i = 0; i < num_input_fields; i++) {
74     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
75     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
76       data->fields.inputs[i] = NULL;
77     } else {
78       CeedVector vec;
79 
80       // Get input vector
81       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
82       if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
83       CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
84     }
85   }
86 
87   // Output vectors
88   for (CeedInt i = 0; i < num_output_fields; i++) {
89     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
90     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
91       data->fields.outputs[i] = NULL;
92     } else {
93       CeedVector vec;
94 
95       // Get output vector
96       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
97       if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
98       output_vecs[i] = vec;
99       // Check for multiple output modes
100       CeedInt index = -1;
101       for (CeedInt j = 0; j < i; j++) {
102         if (vec == output_vecs[j]) {
103           index = j;
104           break;
105         }
106       }
107       if (index == -1) {
108         CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
109       } else {
110         data->fields.outputs[i] = data->fields.outputs[index];
111       }
112     }
113   }
114 
115   // Point coordinates, if needed
116   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
117   if (is_at_points) {
118     // Coords
119     CeedVector vec;
120 
121     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
122     CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
123     CeedCallBackend(CeedVectorDestroy(&vec));
124 
125     // Points per elem
126     if (num_elem != data->points.num_elem) {
127       CeedInt            *points_per_elem;
128       const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
129       CeedElemRestriction rstr_points = NULL;
130 
131       data->points.num_elem = num_elem;
132       CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
133       CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
134       for (CeedInt e = 0; e < num_elem; e++) {
135         CeedInt num_points_elem;
136 
137         CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
138         points_per_elem[e] = num_points_elem;
139       }
140       if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
141       CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
142       CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
143       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
144       CeedCallBackend(CeedFree(&points_per_elem));
145     }
146   }
147 
148   // Get context data
149   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
150 
151   // Apply operator
152   void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
153   const CeedInt dim       = data->dim;
154   const CeedInt Q_1d      = data->Q_1d;
155   const CeedInt P_1d      = data->max_P_1d;
156   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
157 
158   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
159   CeedInt block_sizes[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
160 
161   if (is_tensor) {
162     CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, P_1d, Q_1d, block_sizes));
163   } else {
164     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
165 
166     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
167     block_sizes[2]  = elems_per_block;
168   }
169   if (dim == 1 || !is_tensor) {
170     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
171     CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
172 
173     CeedCallBackend(
174         CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs));
175   } else if (dim == 2) {
176     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
177     CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
178 
179     CeedCallBackend(
180         CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs));
181   } else if (dim == 3) {
182     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
183     CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
184 
185     CeedCallBackend(
186         CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs));
187   }
188 
189   // Restore input arrays
190   for (CeedInt i = 0; i < num_input_fields; i++) {
191     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
192     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
193     } else {
194       CeedVector vec;
195 
196       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
197       if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
198       CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
199     }
200   }
201 
202   // Restore output arrays
203   for (CeedInt i = 0; i < num_output_fields; i++) {
204     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
205     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
206     } else {
207       CeedVector vec;
208 
209       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
210       if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
211       // Check for multiple output modes
212       CeedInt index = -1;
213 
214       for (CeedInt j = 0; j < i; j++) {
215         if (vec == output_vecs[j]) {
216           index = j;
217           break;
218         }
219       }
220       if (index == -1) {
221         CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
222       }
223     }
224   }
225 
226   // Restore point coordinates, if needed
227   if (is_at_points) {
228     CeedVector vec;
229 
230     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
231     CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
232     CeedCallBackend(CeedVectorDestroy(&vec));
233   }
234 
235   // Restore context data
236   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
237 
238   // Cleanup
239   CeedCallBackend(CeedDestroy(&ceed));
240   CeedCallBackend(CeedQFunctionDestroy(&qf));
241 
242   // Fallback if run was bad (out of resources)
243   if (!is_good_run) {
244     CeedOperator op_fallback;
245 
246     data->use_fallback = true;
247     CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to kernel execution issue");
248     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
249     CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
250     return CEED_ERROR_SUCCESS;
251   }
252   return CEED_ERROR_SUCCESS;
253 }
254 
255 //------------------------------------------------------------------------------
256 // Create operator
257 //------------------------------------------------------------------------------
258 int CeedOperatorCreate_Hip_gen(CeedOperator op) {
259   Ceed                  ceed;
260   CeedOperator_Hip_gen *impl;
261 
262   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
263   CeedCallBackend(CeedCalloc(1, &impl));
264   CeedCallBackend(CeedOperatorSetData(op, impl));
265   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen));
266   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen));
267   CeedCallBackend(CeedDestroy(&ceed));
268   return CEED_ERROR_SUCCESS;
269 }
270 
271 //------------------------------------------------------------------------------
272