1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 #include <stdbool.h>
14 #include <stddef.h>
15 #include <string.h>
16
17 #include "../cuda/ceed-cuda-common.h"
18 #include "../cuda/ceed-cuda-compile.h"
19 #include "ceed-cuda-ref.h"
20
21 //------------------------------------------------------------------------------
22 // Compile restriction kernels
23 //------------------------------------------------------------------------------
CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)24 static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr) {
25 Ceed ceed;
26 bool is_deterministic;
27 CeedInt num_elem, num_comp, elem_size, comp_stride;
28 CeedRestrictionType rstr_type;
29 CeedElemRestriction_Cuda *impl;
30
31 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
32 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
33 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
34 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
35 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
36 CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
37 if (rstr_type == CEED_RESTRICTION_POINTS) {
38 CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr, &elem_size));
39 } else {
40 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
41 }
42 is_deterministic = impl->d_l_vec_indices != NULL;
43
44 // Compile CUDA kernels
45 switch (rstr_type) {
46 case CEED_RESTRICTION_STRIDED: {
47 const char restriction_kernel_source[] = "// Strided restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-strided.h>\n";
48 bool has_backend_strides;
49 CeedInt strides[3] = {1, num_elem * elem_size, elem_size};
50
51 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
52 if (!has_backend_strides) {
53 CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
54 }
55 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
56 "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM",
57 strides[2]));
58 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose));
59 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose));
60 } break;
61 case CEED_RESTRICTION_STANDARD: {
62 const char restriction_kernel_source[] = "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
63
64 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
65 "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
66 "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
67 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
68 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose));
69 } break;
70 case CEED_RESTRICTION_POINTS: {
71 const char restriction_kernel_source[] =
72 "// AtPoints restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-at-points.h>\n\n"
73 "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
74
75 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
76 "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
77 "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
78 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
79 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose));
80 } break;
81 case CEED_RESTRICTION_ORIENTED: {
82 const char restriction_kernel_source[] =
83 "// Oriented restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-oriented.h>\n\n"
84 "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
85
86 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
87 "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
88 "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
89 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedNoTranspose", &impl->ApplyNoTranspose));
90 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose));
91 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose));
92 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose));
93 } break;
94 case CEED_RESTRICTION_CURL_ORIENTED: {
95 const char restriction_kernel_source[] =
96 "// Curl oriented restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h>\n\n"
97 "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
98 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
99 "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
100 "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
101 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedNoTranspose", &impl->ApplyNoTranspose));
102 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->ApplyUnsignedNoTranspose));
103 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnorientedNoTranspose));
104 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose));
105 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose));
106 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose));
107 } break;
108 }
109 CeedCallBackend(CeedDestroy(&ceed));
110 return CEED_ERROR_SUCCESS;
111 }
112
113 //------------------------------------------------------------------------------
114 // Core apply restriction code
115 //------------------------------------------------------------------------------
CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr,CeedTransposeMode t_mode,bool use_signs,bool use_orients,CeedVector u,CeedVector v,CeedRequest * request)116 static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients,
117 CeedVector u, CeedVector v, CeedRequest *request) {
118 Ceed ceed;
119 CeedRestrictionType rstr_type;
120 const CeedScalar *d_u;
121 CeedScalar *d_v;
122 CeedElemRestriction_Cuda *impl;
123
124 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
125 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
126 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
127
128 // Assemble kernel if needed
129 if (!impl->module) {
130 CeedCallBackend(CeedElemRestrictionSetupCompile_Cuda(rstr));
131 }
132
133 // Get vectors
134 CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
135 if (t_mode == CEED_TRANSPOSE) {
136 // Sum into for transpose mode, e-vec to l-vec
137 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
138 } else {
139 // Overwrite for notranspose mode, l-vec to e-vec
140 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
141 }
142
143 // Restrict
144 if (t_mode == CEED_NOTRANSPOSE) {
145 // L-vector -> E-vector
146 CeedInt elem_size;
147
148 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
149 const CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024;
150 const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size);
151
152 switch (rstr_type) {
153 case CEED_RESTRICTION_STRIDED: {
154 void *args[] = {&d_u, &d_v};
155
156 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args));
157 } break;
158 case CEED_RESTRICTION_POINTS:
159 case CEED_RESTRICTION_STANDARD: {
160 void *args[] = {&impl->d_offsets, &d_u, &d_v};
161
162 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args));
163 } break;
164 case CEED_RESTRICTION_ORIENTED: {
165 if (use_signs) {
166 void *args[] = {&impl->d_offsets, &impl->d_orients, &d_u, &d_v};
167
168 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args));
169 } else {
170 void *args[] = {&impl->d_offsets, &d_u, &d_v};
171
172 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args));
173 }
174 } break;
175 case CEED_RESTRICTION_CURL_ORIENTED: {
176 if (use_signs && use_orients) {
177 void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v};
178
179 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args));
180 } else if (use_orients) {
181 void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v};
182
183 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args));
184 } else {
185 void *args[] = {&impl->d_offsets, &d_u, &d_v};
186
187 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args));
188 }
189 } break;
190 }
191 } else {
192 // E-vector -> L-vector
193 const bool is_deterministic = impl->d_l_vec_indices != NULL;
194 const CeedInt block_size = 32;
195 const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size);
196
197 switch (rstr_type) {
198 case CEED_RESTRICTION_STRIDED: {
199 void *args[] = {&d_u, &d_v};
200
201 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
202 } break;
203 case CEED_RESTRICTION_POINTS: {
204 if (!is_deterministic) {
205 void *args[] = {&impl->d_offsets, &impl->d_points_per_elem, &d_u, &d_v};
206
207 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
208 } else {
209 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_points_per_elem, &impl->d_t_offsets, &d_u, &d_v};
210
211 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
212 }
213 } break;
214 case CEED_RESTRICTION_STANDARD: {
215 if (!is_deterministic) {
216 void *args[] = {&impl->d_offsets, &d_u, &d_v};
217
218 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
219 } else {
220 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v};
221
222 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
223 }
224 } break;
225 case CEED_RESTRICTION_ORIENTED: {
226 if (use_signs) {
227 if (!is_deterministic) {
228 void *args[] = {&impl->d_offsets, &impl->d_orients, &d_u, &d_v};
229
230 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
231 } else {
232 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_orients, &d_u, &d_v};
233
234 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
235 }
236 } else {
237 if (!is_deterministic) {
238 void *args[] = {&impl->d_offsets, &d_u, &d_v};
239
240 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args));
241 } else {
242 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v};
243
244 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args));
245 }
246 }
247 } break;
248 case CEED_RESTRICTION_CURL_ORIENTED: {
249 if (use_signs && use_orients) {
250 if (!is_deterministic) {
251 void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v};
252
253 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
254 } else {
255 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v};
256
257 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
258 }
259 } else if (use_orients) {
260 if (!is_deterministic) {
261 void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v};
262
263 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args));
264 } else {
265 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v};
266
267 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args));
268 }
269 } else {
270 if (!is_deterministic) {
271 void *args[] = {&impl->d_offsets, &d_u, &d_v};
272
273 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args));
274 } else {
275 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v};
276
277 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args));
278 }
279 }
280 } break;
281 }
282 }
283
284 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL;
285
286 // Restore arrays
287 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
288 CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
289 CeedCallBackend(CeedDestroy(&ceed));
290 return CEED_ERROR_SUCCESS;
291 }
292
293 //------------------------------------------------------------------------------
294 // Apply restriction
295 //------------------------------------------------------------------------------
CeedElemRestrictionApply_Cuda(CeedElemRestriction rstr,CeedTransposeMode t_mode,CeedVector u,CeedVector v,CeedRequest * request)296 static int CeedElemRestrictionApply_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) {
297 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, true, true, u, v, request);
298 }
299
300 //------------------------------------------------------------------------------
301 // Apply unsigned restriction
302 //------------------------------------------------------------------------------
CeedElemRestrictionApplyUnsigned_Cuda(CeedElemRestriction rstr,CeedTransposeMode t_mode,CeedVector u,CeedVector v,CeedRequest * request)303 static int CeedElemRestrictionApplyUnsigned_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
304 CeedRequest *request) {
305 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, true, u, v, request);
306 }
307
308 //------------------------------------------------------------------------------
309 // Apply unoriented restriction
310 //------------------------------------------------------------------------------
CeedElemRestrictionApplyUnoriented_Cuda(CeedElemRestriction rstr,CeedTransposeMode t_mode,CeedVector u,CeedVector v,CeedRequest * request)311 static int CeedElemRestrictionApplyUnoriented_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
312 CeedRequest *request) {
313 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, false, u, v, request);
314 }
315
316 //------------------------------------------------------------------------------
317 // Get offsets
318 //------------------------------------------------------------------------------
CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr,CeedMemType mem_type,const CeedInt ** offsets)319 static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) {
320 CeedElemRestriction_Cuda *impl;
321 CeedRestrictionType rstr_type;
322
323 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
324 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
325 switch (mem_type) {
326 case CEED_MEM_HOST:
327 *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->h_offsets_at_points : impl->h_offsets;
328 break;
329 case CEED_MEM_DEVICE:
330 *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->d_offsets_at_points : impl->d_offsets;
331 break;
332 }
333 return CEED_ERROR_SUCCESS;
334 }
335
336 //------------------------------------------------------------------------------
337 // Get orientations
338 //------------------------------------------------------------------------------
CeedElemRestrictionGetOrientations_Cuda(CeedElemRestriction rstr,CeedMemType mem_type,const bool ** orients)339 static int CeedElemRestrictionGetOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) {
340 CeedElemRestriction_Cuda *impl;
341 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
342
343 switch (mem_type) {
344 case CEED_MEM_HOST:
345 *orients = impl->h_orients;
346 break;
347 case CEED_MEM_DEVICE:
348 *orients = impl->d_orients;
349 break;
350 }
351 return CEED_ERROR_SUCCESS;
352 }
353
354 //------------------------------------------------------------------------------
355 // Get curl-conforming orientations
356 //------------------------------------------------------------------------------
CeedElemRestrictionGetCurlOrientations_Cuda(CeedElemRestriction rstr,CeedMemType mem_type,const CeedInt8 ** curl_orients)357 static int CeedElemRestrictionGetCurlOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) {
358 CeedElemRestriction_Cuda *impl;
359 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
360
361 switch (mem_type) {
362 case CEED_MEM_HOST:
363 *curl_orients = impl->h_curl_orients;
364 break;
365 case CEED_MEM_DEVICE:
366 *curl_orients = impl->d_curl_orients;
367 break;
368 }
369 return CEED_ERROR_SUCCESS;
370 }
371
372 //------------------------------------------------------------------------------
373 // Get offset for padded AtPoints E-layout
374 //------------------------------------------------------------------------------
CeedElemRestrictionGetAtPointsElementOffset_Cuda(CeedElemRestriction rstr,CeedInt elem,CeedSize * elem_offset)375 static int CeedElemRestrictionGetAtPointsElementOffset_Cuda(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset) {
376 CeedInt layout[3];
377
378 CeedCallBackend(CeedElemRestrictionGetELayout(rstr, layout));
379 *elem_offset = 0 * layout[0] + 0 * layout[1] + elem * layout[2];
380 return CEED_ERROR_SUCCESS;
381 }
382
383 //------------------------------------------------------------------------------
384 // Destroy restriction
385 //------------------------------------------------------------------------------
CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr)386 static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) {
387 Ceed ceed;
388 CeedElemRestriction_Cuda *impl;
389
390 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
391 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
392 if (impl->module) {
393 CeedCallCuda(ceed, cuModuleUnload(impl->module));
394 }
395 CeedCallBackend(CeedFree(&impl->h_offsets_owned));
396 CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_offsets_owned));
397 CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_t_offsets));
398 CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_t_indices));
399 CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_l_vec_indices));
400 CeedCallBackend(CeedFree(&impl->h_orients_owned));
401 CeedCallCuda(ceed, cudaFree((bool *)impl->d_orients_owned));
402 CeedCallBackend(CeedFree(&impl->h_curl_orients_owned));
403 CeedCallCuda(ceed, cudaFree((CeedInt8 *)impl->d_curl_orients_owned));
404 CeedCallBackend(CeedFree(&impl->h_offsets_at_points_owned));
405 CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_offsets_at_points_owned));
406 CeedCallBackend(CeedFree(&impl->h_points_per_elem_owned));
407 CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_points_per_elem_owned));
408 CeedCallBackend(CeedFree(&impl));
409 CeedCallBackend(CeedDestroy(&ceed));
410 return CEED_ERROR_SUCCESS;
411 }
412
413 //------------------------------------------------------------------------------
414 // Create transpose offsets and indices
415 //------------------------------------------------------------------------------
CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr,const CeedInt elem_size,const CeedInt * indices)416 static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt elem_size, const CeedInt *indices) {
417 Ceed ceed;
418 bool *is_node;
419 CeedSize l_size;
420 CeedInt num_elem, num_comp, num_nodes = 0;
421 CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices;
422 CeedRestrictionType rstr_type;
423 CeedElemRestriction_Cuda *impl;
424
425 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
426 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
427 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
428 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
429 CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
430 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
431 const CeedInt size_indices = num_elem * elem_size;
432
433 // Count num_nodes
434 CeedCallBackend(CeedCalloc(l_size, &is_node));
435
436 for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1;
437 for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i];
438 impl->num_nodes = num_nodes;
439
440 // L-vector offsets array
441 CeedCallBackend(CeedCalloc(l_size, &ind_to_offset));
442 CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices));
443 for (CeedInt i = 0, j = 0; i < l_size; i++) {
444 if (is_node[i]) {
445 l_vec_indices[j] = i;
446 ind_to_offset[i] = j++;
447 }
448 }
449 CeedCallBackend(CeedFree(&is_node));
450
451 // Compute transpose offsets and indices
452 const CeedInt size_offsets = num_nodes + 1;
453
454 CeedCallBackend(CeedCalloc(size_offsets, &t_offsets));
455 CeedCallBackend(CeedMalloc(size_indices, &t_indices));
456 // Count node multiplicity
457 for (CeedInt e = 0; e < num_elem; ++e) {
458 for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1];
459 }
460 // Convert to running sum
461 for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1];
462 // List all E-vec indices associated with L-vec node
463 for (CeedInt e = 0; e < num_elem; ++e) {
464 for (CeedInt i = 0; i < elem_size; ++i) {
465 const CeedInt lid = elem_size * e + i;
466 const CeedInt gid = indices[lid];
467
468 t_indices[t_offsets[ind_to_offset[gid]]++] = lid;
469 }
470 }
471 // Reset running sum
472 for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1];
473 t_offsets[0] = 0;
474
475 // Copy data to device
476 // -- L-vector indices
477 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt)));
478 CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), cudaMemcpyHostToDevice));
479 // -- Transpose offsets
480 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt)));
481 CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), cudaMemcpyHostToDevice));
482 // -- Transpose indices
483 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt)));
484 CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), cudaMemcpyHostToDevice));
485
486 // Cleanup
487 CeedCallBackend(CeedFree(&ind_to_offset));
488 CeedCallBackend(CeedFree(&l_vec_indices));
489 CeedCallBackend(CeedFree(&t_offsets));
490 CeedCallBackend(CeedFree(&t_indices));
491 CeedCallBackend(CeedDestroy(&ceed));
492 return CEED_ERROR_SUCCESS;
493 }
494
495 //------------------------------------------------------------------------------
496 // Create restriction
497 //------------------------------------------------------------------------------
CeedElemRestrictionCreate_Cuda(CeedMemType mem_type,CeedCopyMode copy_mode,const CeedInt * offsets,const bool * orients,const CeedInt8 * curl_orients,CeedElemRestriction rstr)498 int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, const bool *orients,
499 const CeedInt8 *curl_orients, CeedElemRestriction rstr) {
500 Ceed ceed, ceed_parent;
501 bool is_deterministic;
502 CeedInt num_elem, num_comp, elem_size;
503 CeedRestrictionType rstr_type;
504 CeedElemRestriction_Cuda *impl;
505
506 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
507 CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
508 CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic));
509 CeedCallBackend(CeedDestroy(&ceed_parent));
510 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
511 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
512 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
513 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
514 // Use max number of points as elem size for AtPoints restrictions
515 if (rstr_type == CEED_RESTRICTION_POINTS) {
516 CeedInt max_points = 0;
517
518 for (CeedInt i = 0; i < num_elem; i++) {
519 max_points = CeedIntMax(max_points, offsets[i + 1] - offsets[i]);
520 }
521 elem_size = max_points;
522 }
523 const CeedInt size = num_elem * elem_size;
524
525 CeedCallBackend(CeedCalloc(1, &impl));
526 impl->num_nodes = size;
527 CeedCallBackend(CeedElemRestrictionSetData(rstr, impl));
528
529 // Set layouts
530 {
531 bool has_backend_strides;
532 CeedInt layout[3] = {1, size, elem_size};
533
534 CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout));
535 if (rstr_type == CEED_RESTRICTION_STRIDED) {
536 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
537 if (has_backend_strides) {
538 CeedCallBackend(CeedElemRestrictionSetLLayout(rstr, layout));
539 }
540 }
541 }
542
543 // Pad AtPoints indices
544 if (rstr_type == CEED_RESTRICTION_POINTS) {
545 CeedSize offsets_len = elem_size * num_elem, at_points_size = num_elem + 1;
546 CeedInt max_points = elem_size, *offsets_padded, *points_per_elem;
547
548 CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "only MemType Host supported when creating AtPoints restriction");
549 CeedCallBackend(CeedMalloc(offsets_len, &offsets_padded));
550 CeedCallBackend(CeedMalloc(num_elem, &points_per_elem));
551 for (CeedInt i = 0; i < num_elem; i++) {
552 CeedInt num_points = offsets[i + 1] - offsets[i];
553 CeedInt last_point = 0;
554
555 points_per_elem[i] = num_points;
556 at_points_size += num_points;
557 // -- Copy all points in element
558 for (CeedInt j = 0; j < num_points; j++) {
559 offsets_padded[i * max_points + j] = offsets[offsets[i] + j] * num_comp;
560 last_point = offsets_padded[i * max_points + j];
561 }
562 // -- Replicate out last point in element
563 for (CeedInt j = num_points; j < max_points; j++) {
564 offsets_padded[i * max_points + j] = last_point;
565 }
566 }
567 CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed,
568 &impl->h_offsets_at_points));
569 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_offsets_at_points_owned, at_points_size * sizeof(CeedInt)));
570 CeedCallCuda(ceed, cudaMemcpy((CeedInt **)impl->d_offsets_at_points_owned, impl->h_offsets_at_points, at_points_size * sizeof(CeedInt),
571 cudaMemcpyHostToDevice));
572 impl->d_offsets_at_points = (CeedInt *)impl->d_offsets_at_points_owned;
573
574 // -- Use padded offsets for the rest of the setup
575 offsets = (const CeedInt *)offsets_padded;
576 copy_mode = CEED_OWN_POINTER;
577 CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, elem_size * num_elem * num_comp));
578
579 // -- Points per element
580 CeedCallBackend(CeedSetHostCeedIntArray(points_per_elem, CEED_OWN_POINTER, num_elem, &impl->h_points_per_elem_owned,
581 &impl->h_points_per_elem_borrowed, &impl->h_points_per_elem));
582 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_points_per_elem_owned, num_elem * sizeof(CeedInt)));
583 CeedCallCuda(ceed,
584 cudaMemcpy((CeedInt **)impl->d_points_per_elem_owned, impl->h_points_per_elem, num_elem * sizeof(CeedInt), cudaMemcpyHostToDevice));
585 impl->d_points_per_elem = (CeedInt *)impl->d_points_per_elem_owned;
586 }
587
588 // Set up device offset/orientation arrays
589 if (rstr_type != CEED_RESTRICTION_STRIDED) {
590 switch (mem_type) {
591 case CEED_MEM_HOST: {
592 CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, size, &impl->h_offsets_owned, &impl->h_offsets_borrowed, &impl->h_offsets));
593 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_offsets_owned, size * sizeof(CeedInt)));
594 CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_offsets_owned, impl->h_offsets, size * sizeof(CeedInt), cudaMemcpyHostToDevice));
595 impl->d_offsets = (CeedInt *)impl->d_offsets_owned;
596 if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, elem_size, offsets));
597 } break;
598 case CEED_MEM_DEVICE: {
599 CeedCallBackend(CeedSetDeviceCeedIntArray_Cuda(ceed, offsets, copy_mode, size, &impl->d_offsets_owned, &impl->d_offsets_borrowed,
600 (const CeedInt **)&impl->d_offsets));
601 CeedCallBackend(CeedMalloc(size, &impl->h_offsets_owned));
602 CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->h_offsets_owned, impl->d_offsets, size * sizeof(CeedInt), cudaMemcpyDeviceToHost));
603 impl->h_offsets = impl->h_offsets_owned;
604 if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, elem_size, offsets));
605 } break;
606 }
607
608 // Orientation data
609 if (rstr_type == CEED_RESTRICTION_ORIENTED) {
610 switch (mem_type) {
611 case CEED_MEM_HOST: {
612 CeedCallBackend(CeedSetHostBoolArray(orients, copy_mode, size, &impl->h_orients_owned, &impl->h_orients_borrowed, &impl->h_orients));
613 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_orients_owned, size * sizeof(bool)));
614 CeedCallCuda(ceed, cudaMemcpy((bool *)impl->d_orients_owned, impl->h_orients, size * sizeof(bool), cudaMemcpyHostToDevice));
615 impl->d_orients = impl->d_orients_owned;
616 } break;
617 case CEED_MEM_DEVICE: {
618 CeedCallBackend(CeedSetDeviceBoolArray_Cuda(ceed, orients, copy_mode, size, &impl->d_orients_owned, &impl->d_orients_borrowed,
619 (const bool **)&impl->d_orients));
620 CeedCallBackend(CeedMalloc(size, &impl->h_orients_owned));
621 CeedCallCuda(ceed, cudaMemcpy((bool *)impl->h_orients_owned, impl->d_orients, size * sizeof(bool), cudaMemcpyDeviceToHost));
622 impl->h_orients = impl->h_orients_owned;
623 } break;
624 }
625 } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) {
626 switch (mem_type) {
627 case CEED_MEM_HOST: {
628 CeedCallBackend(CeedSetHostCeedInt8Array(curl_orients, copy_mode, 3 * size, &impl->h_curl_orients_owned, &impl->h_curl_orients_borrowed,
629 &impl->h_curl_orients));
630 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_curl_orients_owned, 3 * size * sizeof(CeedInt8)));
631 CeedCallCuda(ceed,
632 cudaMemcpy((CeedInt8 *)impl->d_curl_orients_owned, impl->h_curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyHostToDevice));
633 impl->d_curl_orients = impl->d_curl_orients_owned;
634 } break;
635 case CEED_MEM_DEVICE: {
636 CeedCallBackend(CeedSetDeviceCeedInt8Array_Cuda(ceed, curl_orients, copy_mode, 3 * size, &impl->d_curl_orients_owned,
637 &impl->d_curl_orients_borrowed, (const CeedInt8 **)&impl->d_curl_orients));
638 CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_owned));
639 CeedCallCuda(ceed,
640 cudaMemcpy((CeedInt8 *)impl->h_curl_orients_owned, impl->d_curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyDeviceToHost));
641 impl->h_curl_orients = impl->h_curl_orients_owned;
642 } break;
643 }
644 }
645 }
646
647 // Register backend functions
648 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Cuda));
649 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Cuda));
650 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Cuda));
651 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda));
652 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Cuda));
653 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Cuda));
654 if (rstr_type == CEED_RESTRICTION_POINTS) {
655 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset",
656 CeedElemRestrictionGetAtPointsElementOffset_Cuda));
657 }
658 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda));
659 CeedCallBackend(CeedDestroy(&ceed));
660 return CEED_ERROR_SUCCESS;
661 }
662
663 //------------------------------------------------------------------------------
664