xref: /libCEED/backends/hip-ref/ceed-hip-ref-restriction.c (revision cdf95791513f7c35170bef3ba2e19f272fe04533)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed/ceed.h>
18 #include <ceed/backend.h>
19 #include <ceed/jit-tools.h>
20 #include <hip/hip_runtime.h>
21 #include <stdbool.h>
22 #include <stddef.h>
23 #include "ceed-hip-ref.h"
24 #include "../hip/ceed-hip-compile.h"
25 
26 //------------------------------------------------------------------------------
27 // Apply restriction
28 //------------------------------------------------------------------------------
29 static int CeedElemRestrictionApply_Hip(CeedElemRestriction r,
30                                         CeedTransposeMode t_mode, CeedVector u,
31                                         CeedVector v, CeedRequest *request) {
32   int ierr;
33   CeedElemRestriction_Hip *impl;
34   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
35   Ceed ceed;
36   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
37   Ceed_Hip *data;
38   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
39   const CeedInt block_size = 64;
40   const CeedInt num_nodes = impl->num_nodes;
41   CeedInt num_elem, elem_size;
42   CeedElemRestrictionGetNumElements(r, &num_elem);
43   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
44   hipFunction_t kernel;
45 
46   // Get vectors
47   const CeedScalar *d_u;
48   CeedScalar *d_v;
49   ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
50   if (t_mode == CEED_TRANSPOSE) {
51     // Sum into for transpose mode, e-vec to l-vec
52     ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
53   } else {
54     // Overwrite for notranspose mode, l-vec to e-vec
55     ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
56   }
57 
58   // Restrict
59   if (t_mode == CEED_NOTRANSPOSE) {
60     // L-vector -> E-vector
61     if (impl->d_ind) {
62       // -- Offsets provided
63       kernel = impl->OffsetNoTranspose;
64       void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v};
65       CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256;
66       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
67                               block_size, args); CeedChkBackend(ierr);
68     } else {
69       // -- Strided restriction
70       kernel = impl->StridedNoTranspose;
71       void *args[] = {&num_elem, &d_u, &d_v};
72       CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256;
73       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
74                               block_size, args); CeedChkBackend(ierr);
75     }
76   } else {
77     // E-vector -> L-vector
78     if (impl->d_ind) {
79       // -- Offsets provided
80       kernel = impl->OffsetTranspose;
81       void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices,
82                       &impl->d_t_offsets, &d_u, &d_v
83                      };
84       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
85                               block_size, args); CeedChkBackend(ierr);
86     } else {
87       // -- Strided restriction
88       kernel = impl->StridedTranspose;
89       void *args[] = {&num_elem, &d_u, &d_v};
90       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
91                               block_size, args); CeedChkBackend(ierr);
92     }
93   }
94 
95   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
96     *request = NULL;
97 
98   // Restore arrays
99   ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
100   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
101   return CEED_ERROR_SUCCESS;
102 }
103 
104 //------------------------------------------------------------------------------
105 // Blocked not supported
106 //------------------------------------------------------------------------------
107 int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block,
108                                       CeedTransposeMode t_mode, CeedVector u,
109                                       CeedVector v, CeedRequest *request) {
110   // LCOV_EXCL_START
111   int ierr;
112   Ceed ceed;
113   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
114   return CeedError(ceed, CEED_ERROR_BACKEND,
115                    "Backend does not implement blocked restrictions");
116   // LCOV_EXCL_STOP
117 }
118 
119 //------------------------------------------------------------------------------
120 // Get offsets
121 //------------------------------------------------------------------------------
122 static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr,
123     CeedMemType mtype, const CeedInt **offsets) {
124   int ierr;
125   CeedElemRestriction_Hip *impl;
126   ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr);
127 
128   switch (mtype) {
129   case CEED_MEM_HOST:
130     *offsets = impl->h_ind;
131     break;
132   case CEED_MEM_DEVICE:
133     *offsets = impl->d_ind;
134     break;
135   }
136   return CEED_ERROR_SUCCESS;
137 }
138 
139 //------------------------------------------------------------------------------
140 // Destroy restriction
141 //------------------------------------------------------------------------------
142 static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) {
143   int ierr;
144   CeedElemRestriction_Hip *impl;
145   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
146 
147   Ceed ceed;
148   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
149   ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr);
150   ierr = CeedFree(&impl->h_ind_allocated); CeedChkBackend(ierr);
151   ierr = hipFree(impl->d_ind_allocated); CeedChk_Hip(ceed, ierr);
152   ierr = hipFree(impl->d_t_offsets); CeedChk_Hip(ceed, ierr);
153   ierr = hipFree(impl->d_t_indices); CeedChk_Hip(ceed, ierr);
154   ierr = hipFree(impl->d_l_vec_indices); CeedChk_Hip(ceed, ierr);
155   ierr = CeedFree(&impl); CeedChkBackend(ierr);
156 
157   return CEED_ERROR_SUCCESS;
158 }
159 
160 //------------------------------------------------------------------------------
161 // Create transpose offsets and indices
162 //------------------------------------------------------------------------------
163 static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r,
164     const CeedInt *indices) {
165   int ierr;
166   Ceed ceed;
167   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
168   CeedElemRestriction_Hip *impl;
169   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
170   CeedInt num_elem, elem_size, l_size, num_comp;
171   ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr);
172   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
173   ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr);
174   ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr);
175 
176   // Count num_nodes
177   bool *is_node;
178   ierr = CeedCalloc(l_size, &is_node); CeedChkBackend(ierr);
179   const CeedInt size_indices = num_elem * elem_size;
180   for (CeedInt i = 0; i < size_indices; i++)
181     is_node[indices[i]] = 1;
182   CeedInt num_nodes = 0;
183   for (CeedInt i = 0; i < l_size; i++)
184     num_nodes += is_node[i];
185   impl->num_nodes = num_nodes;
186 
187   // L-vector offsets array
188   CeedInt *ind_to_offset, *l_vec_indices;
189   ierr = CeedCalloc(l_size, &ind_to_offset); CeedChkBackend(ierr);
190   ierr = CeedCalloc(num_nodes, &l_vec_indices); CeedChkBackend(ierr);
191   CeedInt j = 0;
192   for (CeedInt i = 0; i < l_size; i++)
193     if (is_node[i]) {
194       l_vec_indices[j] = i;
195       ind_to_offset[i] = j++;
196     }
197   ierr = CeedFree(&is_node); CeedChkBackend(ierr);
198 
199   // Compute transpose offsets and indices
200   const CeedInt size_offsets = num_nodes + 1;
201   CeedInt *t_offsets;
202   ierr = CeedCalloc(size_offsets, &t_offsets); CeedChkBackend(ierr);
203   CeedInt *t_indices;
204   ierr = CeedMalloc(size_indices, &t_indices); CeedChkBackend(ierr);
205   // Count node multiplicity
206   for (CeedInt e = 0; e < num_elem; ++e)
207     for (CeedInt i = 0; i < elem_size; ++i)
208       ++t_offsets[ind_to_offset[indices[elem_size*e + i]] + 1];
209   // Convert to running sum
210   for (CeedInt i = 1; i < size_offsets; ++i)
211     t_offsets[i] += t_offsets[i-1];
212   // List all E-vec indices associated with L-vec node
213   for (CeedInt e = 0; e < num_elem; ++e) {
214     for (CeedInt i = 0; i < elem_size; ++i) {
215       const CeedInt lid = elem_size*e + i;
216       const CeedInt gid = indices[lid];
217       t_indices[t_offsets[ind_to_offset[gid]]++] = lid;
218     }
219   }
220   // Reset running sum
221   for (int i = size_offsets - 1; i > 0; --i)
222     t_offsets[i] = t_offsets[i - 1];
223   t_offsets[0] = 0;
224 
225   // Copy data to device
226   // -- L-vector indices
227   ierr = hipMalloc((void **)&impl->d_l_vec_indices, num_nodes*sizeof(CeedInt));
228   CeedChk_Hip(ceed, ierr);
229   ierr = hipMemcpy(impl->d_l_vec_indices, l_vec_indices,
230                    num_nodes*sizeof(CeedInt), hipMemcpyHostToDevice);
231   CeedChk_Hip(ceed, ierr);
232   // -- Transpose offsets
233   ierr = hipMalloc((void **)&impl->d_t_offsets, size_offsets*sizeof(CeedInt));
234   CeedChk_Hip(ceed, ierr);
235   ierr = hipMemcpy(impl->d_t_offsets, t_offsets, size_offsets*sizeof(CeedInt),
236                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
237   // -- Transpose indices
238   ierr = hipMalloc((void **)&impl->d_t_indices, size_indices*sizeof(CeedInt));
239   CeedChk_Hip(ceed, ierr);
240   ierr = hipMemcpy(impl->d_t_indices, t_indices, size_indices*sizeof(CeedInt),
241                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
242 
243   // Cleanup
244   ierr = CeedFree(&ind_to_offset); CeedChkBackend(ierr);
245   ierr = CeedFree(&l_vec_indices); CeedChkBackend(ierr);
246   ierr = CeedFree(&t_offsets); CeedChkBackend(ierr);
247   ierr = CeedFree(&t_indices); CeedChkBackend(ierr);
248 
249   return CEED_ERROR_SUCCESS;
250 }
251 
252 //------------------------------------------------------------------------------
253 // Create restriction
254 //------------------------------------------------------------------------------
255 int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode,
256                                   const CeedInt *indices,
257                                   CeedElemRestriction r) {
258   int ierr;
259   Ceed ceed;
260   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
261   CeedElemRestriction_Hip *impl;
262   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
263   CeedInt num_elem, num_comp, elem_size;
264   ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr);
265   ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr);
266   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
267   CeedInt size = num_elem * elem_size;
268   CeedInt strides[3] = {1, size, elem_size};
269   CeedInt comp_stride = 1;
270 
271   // Stride data
272   bool is_strided;
273   ierr = CeedElemRestrictionIsStrided(r, &is_strided); CeedChkBackend(ierr);
274   if (is_strided) {
275     bool has_backend_strides;
276     ierr = CeedElemRestrictionHasBackendStrides(r, &has_backend_strides);
277     CeedChkBackend(ierr);
278     if (!has_backend_strides) {
279       ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr);
280     }
281   } else {
282     ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr);
283   }
284 
285   impl->h_ind           = NULL;
286   impl->h_ind_allocated = NULL;
287   impl->d_ind           = NULL;
288   impl->d_ind_allocated = NULL;
289   impl->d_t_indices     = NULL;
290   impl->d_t_offsets     = NULL;
291   impl->num_nodes = size;
292   ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr);
293   CeedInt layout[3] = {1, elem_size*num_elem, elem_size};
294   ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr);
295 
296   // Set up device indices/offset arrays
297   if (mtype == CEED_MEM_HOST) {
298     switch (cmode) {
299     case CEED_OWN_POINTER:
300       impl->h_ind_allocated = (CeedInt *)indices;
301       impl->h_ind = (CeedInt *)indices;
302       break;
303     case CEED_USE_POINTER:
304       impl->h_ind = (CeedInt *)indices;
305       break;
306     case CEED_COPY_VALUES:
307       break;
308     }
309     if (indices != NULL) {
310       ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt));
311       CeedChk_Hip(ceed, ierr);
312       impl->d_ind_allocated = impl->d_ind; // We own the device memory
313       ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt),
314                        hipMemcpyHostToDevice);
315       CeedChk_Hip(ceed, ierr);
316       ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr);
317     }
318   } else if (mtype == CEED_MEM_DEVICE) {
319     switch (cmode) {
320     case CEED_COPY_VALUES:
321       if (indices != NULL) {
322         ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt));
323         CeedChk_Hip(ceed, ierr);
324         impl->d_ind_allocated = impl->d_ind; // We own the device memory
325         ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt),
326                          hipMemcpyDeviceToDevice);
327         CeedChk_Hip(ceed, ierr);
328       }
329       break;
330     case CEED_OWN_POINTER:
331       impl->d_ind = (CeedInt *)indices;
332       impl->d_ind_allocated = impl->d_ind;
333       break;
334     case CEED_USE_POINTER:
335       impl->d_ind = (CeedInt *)indices;
336     }
337     if (indices != NULL) {
338       ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr);
339     }
340   } else {
341     // LCOV_EXCL_START
342     return CeedError(ceed, CEED_ERROR_BACKEND,
343                      "Only MemType = HOST or DEVICE supported");
344     // LCOV_EXCL_STOP
345   }
346 
347   // Compile HIP kernels
348   CeedInt num_nodes = impl->num_nodes;
349   char *restriction_kernel_path, *restriction_kernel_source;
350   ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-restriction.h",
351                              &restriction_kernel_path); CeedChkBackend(ierr);
352   CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source -----\n");
353   ierr = CeedLoadSourceToBuffer(ceed, restriction_kernel_path,
354                                 &restriction_kernel_source);
355   CeedChkBackend(ierr);
356   CeedDebug256(ceed, 2,
357                "----- Loading Restriction Kernel Source Complete! -----\n");
358   ierr = CeedCompileHip(ceed, restriction_kernel_source, &impl->module, 8,
359                         "RESTR_ELEM_SIZE", elem_size,
360                         "RESTR_NUM_ELEM", num_elem,
361                         "RESTR_NUM_COMP", num_comp,
362                         "RESTR_NUM_NODES", num_nodes,
363                         "RESTR_COMP_STRIDE", comp_stride,
364                         "RESTR_STRIDE_NODES", strides[0],
365                         "RESTR_STRIDE_COMP", strides[1],
366                         "RESTR_STRIDE_ELEM", strides[2]); CeedChkBackend(ierr);
367   ierr = CeedGetKernelHip(ceed, impl->module, "StridedNoTranspose",
368                           &impl->StridedNoTranspose); CeedChkBackend(ierr);
369   ierr = CeedGetKernelHip(ceed, impl->module, "OffsetNoTranspose",
370                           &impl->OffsetNoTranspose); CeedChkBackend(ierr);
371   ierr = CeedGetKernelHip(ceed, impl->module, "StridedTranspose",
372                           &impl->StridedTranspose); CeedChkBackend(ierr);
373   ierr = CeedGetKernelHip(ceed, impl->module, "OffsetTranspose",
374                           &impl->OffsetTranspose); CeedChkBackend(ierr);
375   ierr = CeedFree(&restriction_kernel_path); CeedChkBackend(ierr);
376   ierr = CeedFree(&restriction_kernel_source); CeedChkBackend(ierr);
377 
378   // Register backend functions
379   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply",
380                                 CeedElemRestrictionApply_Hip);
381   CeedChkBackend(ierr);
382   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock",
383                                 CeedElemRestrictionApplyBlock_Hip);
384   CeedChkBackend(ierr);
385   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets",
386                                 CeedElemRestrictionGetOffsets_Hip);
387   CeedChkBackend(ierr);
388   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy",
389                                 CeedElemRestrictionDestroy_Hip);
390   CeedChkBackend(ierr);
391   return CEED_ERROR_SUCCESS;
392 }
393 
394 //------------------------------------------------------------------------------
395 // Blocked not supported
396 //------------------------------------------------------------------------------
397 int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype,
398     const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) {
399   int ierr;
400   Ceed ceed;
401   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
402   return CeedError(ceed, CEED_ERROR_BACKEND,
403                    "Backend does not implement blocked restrictions");
404 }
405 //------------------------------------------------------------------------------
406