xref: /libCEED/backends/hip-ref/ceed-hip-ref-restriction.c (revision e79b91d9f61753a734e6e21c778d772fcdbcc265)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed/ceed.h>
18 #include <ceed/backend.h>
19 #include <ceed/jit-tools.h>
20 #include <hip/hip_runtime.h>
21 #include <stdbool.h>
22 #include <stddef.h>
23 #include "ceed-hip-ref.h"
24 #include "../hip/ceed-hip-compile.h"
25 
26 //------------------------------------------------------------------------------
27 // Apply restriction
28 //------------------------------------------------------------------------------
29 static int CeedElemRestrictionApply_Hip(CeedElemRestriction r,
30                                         CeedTransposeMode t_mode, CeedVector u,
31                                         CeedVector v, CeedRequest *request) {
32   int ierr;
33   CeedElemRestriction_Hip *impl;
34   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
35   Ceed ceed;
36   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
37   Ceed_Hip *data;
38   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
39   const CeedInt block_size = 64;
40   const CeedInt num_nodes = impl->num_nodes;
41   CeedInt num_elem, elem_size;
42   CeedElemRestrictionGetNumElements(r, &num_elem);
43   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
44   hipFunction_t kernel;
45 
46   // Get vectors
47   const CeedScalar *d_u;
48   CeedScalar *d_v;
49   ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
50   if (t_mode == CEED_TRANSPOSE) {
51     // Sum into for transpose mode, e-vec to l-vec
52     ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
53   } else {
54     // Overwrite for notranspose mode, l-vec to e-vec
55     ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
56   }
57 
58   // Restrict
59   if (t_mode == CEED_NOTRANSPOSE) {
60     // L-vector -> E-vector
61     if (impl->d_ind) {
62       // -- Offsets provided
63       kernel = impl->OffsetNoTranspose;
64       void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v};
65       CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256;
66       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
67                               block_size, args); CeedChkBackend(ierr);
68     } else {
69       // -- Strided restriction
70       kernel = impl->StridedNoTranspose;
71       void *args[] = {&num_elem, &d_u, &d_v};
72       CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256;
73       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
74                               block_size, args); CeedChkBackend(ierr);
75     }
76   } else {
77     // E-vector -> L-vector
78     if (impl->d_ind) {
79       // -- Offsets provided
80       kernel = impl->OffsetTranspose;
81       void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices,
82                       &impl->d_t_offsets, &d_u, &d_v
83                      };
84       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
85                               block_size, args); CeedChkBackend(ierr);
86     } else {
87       // -- Strided restriction
88       kernel = impl->StridedTranspose;
89       void *args[] = {&num_elem, &d_u, &d_v};
90       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
91                               block_size, args); CeedChkBackend(ierr);
92     }
93   }
94 
95   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
96     *request = NULL;
97 
98   // Restore arrays
99   ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
100   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
101   return CEED_ERROR_SUCCESS;
102 }
103 
104 //------------------------------------------------------------------------------
105 // Blocked not supported
106 //------------------------------------------------------------------------------
107 int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block,
108                                       CeedTransposeMode t_mode, CeedVector u,
109                                       CeedVector v, CeedRequest *request) {
110   // LCOV_EXCL_START
111   int ierr;
112   Ceed ceed;
113   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
114   return CeedError(ceed, CEED_ERROR_BACKEND,
115                    "Backend does not implement blocked restrictions");
116   // LCOV_EXCL_STOP
117 }
118 
119 //------------------------------------------------------------------------------
120 // Get offsets
121 //------------------------------------------------------------------------------
122 static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr,
123     CeedMemType mtype, const CeedInt **offsets) {
124   int ierr;
125   CeedElemRestriction_Hip *impl;
126   ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr);
127 
128   switch (mtype) {
129   case CEED_MEM_HOST:
130     *offsets = impl->h_ind;
131     break;
132   case CEED_MEM_DEVICE:
133     *offsets = impl->d_ind;
134     break;
135   }
136   return CEED_ERROR_SUCCESS;
137 }
138 
139 //------------------------------------------------------------------------------
140 // Destroy restriction
141 //------------------------------------------------------------------------------
142 static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) {
143   int ierr;
144   CeedElemRestriction_Hip *impl;
145   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
146 
147   Ceed ceed;
148   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
149   ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr);
150   ierr = CeedFree(&impl->h_ind_allocated); CeedChkBackend(ierr);
151   ierr = hipFree(impl->d_ind_allocated); CeedChk_Hip(ceed, ierr);
152   ierr = hipFree(impl->d_t_offsets); CeedChk_Hip(ceed, ierr);
153   ierr = hipFree(impl->d_t_indices); CeedChk_Hip(ceed, ierr);
154   ierr = hipFree(impl->d_l_vec_indices); CeedChk_Hip(ceed, ierr);
155   ierr = CeedFree(&impl); CeedChkBackend(ierr);
156 
157   return CEED_ERROR_SUCCESS;
158 }
159 
160 //------------------------------------------------------------------------------
161 // Create transpose offsets and indices
162 //------------------------------------------------------------------------------
163 static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r,
164     const CeedInt *indices) {
165   int ierr;
166   Ceed ceed;
167   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
168   CeedElemRestriction_Hip *impl;
169   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
170   CeedSize l_size;
171   CeedInt num_elem, elem_size, num_comp;
172   ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr);
173   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
174   ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr);
175   ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr);
176 
177   // Count num_nodes
178   bool *is_node;
179   ierr = CeedCalloc(l_size, &is_node); CeedChkBackend(ierr);
180   const CeedInt size_indices = num_elem * elem_size;
181   for (CeedInt i = 0; i < size_indices; i++)
182     is_node[indices[i]] = 1;
183   CeedInt num_nodes = 0;
184   for (CeedInt i = 0; i < l_size; i++)
185     num_nodes += is_node[i];
186   impl->num_nodes = num_nodes;
187 
188   // L-vector offsets array
189   CeedInt *ind_to_offset, *l_vec_indices;
190   ierr = CeedCalloc(l_size, &ind_to_offset); CeedChkBackend(ierr);
191   ierr = CeedCalloc(num_nodes, &l_vec_indices); CeedChkBackend(ierr);
192   CeedInt j = 0;
193   for (CeedInt i = 0; i < l_size; i++)
194     if (is_node[i]) {
195       l_vec_indices[j] = i;
196       ind_to_offset[i] = j++;
197     }
198   ierr = CeedFree(&is_node); CeedChkBackend(ierr);
199 
200   // Compute transpose offsets and indices
201   const CeedInt size_offsets = num_nodes + 1;
202   CeedInt *t_offsets;
203   ierr = CeedCalloc(size_offsets, &t_offsets); CeedChkBackend(ierr);
204   CeedInt *t_indices;
205   ierr = CeedMalloc(size_indices, &t_indices); CeedChkBackend(ierr);
206   // Count node multiplicity
207   for (CeedInt e = 0; e < num_elem; ++e)
208     for (CeedInt i = 0; i < elem_size; ++i)
209       ++t_offsets[ind_to_offset[indices[elem_size*e + i]] + 1];
210   // Convert to running sum
211   for (CeedInt i = 1; i < size_offsets; ++i)
212     t_offsets[i] += t_offsets[i-1];
213   // List all E-vec indices associated with L-vec node
214   for (CeedInt e = 0; e < num_elem; ++e) {
215     for (CeedInt i = 0; i < elem_size; ++i) {
216       const CeedInt lid = elem_size*e + i;
217       const CeedInt gid = indices[lid];
218       t_indices[t_offsets[ind_to_offset[gid]]++] = lid;
219     }
220   }
221   // Reset running sum
222   for (int i = size_offsets - 1; i > 0; --i)
223     t_offsets[i] = t_offsets[i - 1];
224   t_offsets[0] = 0;
225 
226   // Copy data to device
227   // -- L-vector indices
228   ierr = hipMalloc((void **)&impl->d_l_vec_indices, num_nodes*sizeof(CeedInt));
229   CeedChk_Hip(ceed, ierr);
230   ierr = hipMemcpy(impl->d_l_vec_indices, l_vec_indices,
231                    num_nodes*sizeof(CeedInt), hipMemcpyHostToDevice);
232   CeedChk_Hip(ceed, ierr);
233   // -- Transpose offsets
234   ierr = hipMalloc((void **)&impl->d_t_offsets, size_offsets*sizeof(CeedInt));
235   CeedChk_Hip(ceed, ierr);
236   ierr = hipMemcpy(impl->d_t_offsets, t_offsets, size_offsets*sizeof(CeedInt),
237                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
238   // -- Transpose indices
239   ierr = hipMalloc((void **)&impl->d_t_indices, size_indices*sizeof(CeedInt));
240   CeedChk_Hip(ceed, ierr);
241   ierr = hipMemcpy(impl->d_t_indices, t_indices, size_indices*sizeof(CeedInt),
242                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
243 
244   // Cleanup
245   ierr = CeedFree(&ind_to_offset); CeedChkBackend(ierr);
246   ierr = CeedFree(&l_vec_indices); CeedChkBackend(ierr);
247   ierr = CeedFree(&t_offsets); CeedChkBackend(ierr);
248   ierr = CeedFree(&t_indices); CeedChkBackend(ierr);
249 
250   return CEED_ERROR_SUCCESS;
251 }
252 
253 //------------------------------------------------------------------------------
254 // Create restriction
255 //------------------------------------------------------------------------------
256 int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode,
257                                   const CeedInt *indices,
258                                   CeedElemRestriction r) {
259   int ierr;
260   Ceed ceed;
261   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
262   CeedElemRestriction_Hip *impl;
263   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
264   CeedInt num_elem, num_comp, elem_size;
265   ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr);
266   ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr);
267   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
268   CeedInt size = num_elem * elem_size;
269   CeedInt strides[3] = {1, size, elem_size};
270   CeedInt comp_stride = 1;
271 
272   // Stride data
273   bool is_strided;
274   ierr = CeedElemRestrictionIsStrided(r, &is_strided); CeedChkBackend(ierr);
275   if (is_strided) {
276     bool has_backend_strides;
277     ierr = CeedElemRestrictionHasBackendStrides(r, &has_backend_strides);
278     CeedChkBackend(ierr);
279     if (!has_backend_strides) {
280       ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr);
281     }
282   } else {
283     ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr);
284   }
285 
286   impl->h_ind           = NULL;
287   impl->h_ind_allocated = NULL;
288   impl->d_ind           = NULL;
289   impl->d_ind_allocated = NULL;
290   impl->d_t_indices     = NULL;
291   impl->d_t_offsets     = NULL;
292   impl->num_nodes = size;
293   ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr);
294   CeedInt layout[3] = {1, elem_size*num_elem, elem_size};
295   ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr);
296 
297   // Set up device indices/offset arrays
298   if (mtype == CEED_MEM_HOST) {
299     switch (cmode) {
300     case CEED_OWN_POINTER:
301       impl->h_ind_allocated = (CeedInt *)indices;
302       impl->h_ind = (CeedInt *)indices;
303       break;
304     case CEED_USE_POINTER:
305       impl->h_ind = (CeedInt *)indices;
306       break;
307     case CEED_COPY_VALUES:
308       break;
309     }
310     if (indices != NULL) {
311       ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt));
312       CeedChk_Hip(ceed, ierr);
313       impl->d_ind_allocated = impl->d_ind; // We own the device memory
314       ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt),
315                        hipMemcpyHostToDevice);
316       CeedChk_Hip(ceed, ierr);
317       ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr);
318     }
319   } else if (mtype == CEED_MEM_DEVICE) {
320     switch (cmode) {
321     case CEED_COPY_VALUES:
322       if (indices != NULL) {
323         ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt));
324         CeedChk_Hip(ceed, ierr);
325         impl->d_ind_allocated = impl->d_ind; // We own the device memory
326         ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt),
327                          hipMemcpyDeviceToDevice);
328         CeedChk_Hip(ceed, ierr);
329       }
330       break;
331     case CEED_OWN_POINTER:
332       impl->d_ind = (CeedInt *)indices;
333       impl->d_ind_allocated = impl->d_ind;
334       break;
335     case CEED_USE_POINTER:
336       impl->d_ind = (CeedInt *)indices;
337     }
338     if (indices != NULL) {
339       ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr);
340     }
341   } else {
342     // LCOV_EXCL_START
343     return CeedError(ceed, CEED_ERROR_BACKEND,
344                      "Only MemType = HOST or DEVICE supported");
345     // LCOV_EXCL_STOP
346   }
347 
348   // Compile HIP kernels
349   CeedInt num_nodes = impl->num_nodes;
350   char *restriction_kernel_path, *restriction_kernel_source;
351   ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-restriction.h",
352                              &restriction_kernel_path); CeedChkBackend(ierr);
353   CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source -----\n");
354   ierr = CeedLoadSourceToBuffer(ceed, restriction_kernel_path,
355                                 &restriction_kernel_source);
356   CeedChkBackend(ierr);
357   CeedDebug256(ceed, 2,
358                "----- Loading Restriction Kernel Source Complete! -----\n");
359   ierr = CeedCompileHip(ceed, restriction_kernel_source, &impl->module, 8,
360                         "RESTR_ELEM_SIZE", elem_size,
361                         "RESTR_NUM_ELEM", num_elem,
362                         "RESTR_NUM_COMP", num_comp,
363                         "RESTR_NUM_NODES", num_nodes,
364                         "RESTR_COMP_STRIDE", comp_stride,
365                         "RESTR_STRIDE_NODES", strides[0],
366                         "RESTR_STRIDE_COMP", strides[1],
367                         "RESTR_STRIDE_ELEM", strides[2]); CeedChkBackend(ierr);
368   ierr = CeedGetKernelHip(ceed, impl->module, "StridedNoTranspose",
369                           &impl->StridedNoTranspose); CeedChkBackend(ierr);
370   ierr = CeedGetKernelHip(ceed, impl->module, "OffsetNoTranspose",
371                           &impl->OffsetNoTranspose); CeedChkBackend(ierr);
372   ierr = CeedGetKernelHip(ceed, impl->module, "StridedTranspose",
373                           &impl->StridedTranspose); CeedChkBackend(ierr);
374   ierr = CeedGetKernelHip(ceed, impl->module, "OffsetTranspose",
375                           &impl->OffsetTranspose); CeedChkBackend(ierr);
376   ierr = CeedFree(&restriction_kernel_path); CeedChkBackend(ierr);
377   ierr = CeedFree(&restriction_kernel_source); CeedChkBackend(ierr);
378 
379   // Register backend functions
380   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply",
381                                 CeedElemRestrictionApply_Hip);
382   CeedChkBackend(ierr);
383   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock",
384                                 CeedElemRestrictionApplyBlock_Hip);
385   CeedChkBackend(ierr);
386   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets",
387                                 CeedElemRestrictionGetOffsets_Hip);
388   CeedChkBackend(ierr);
389   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy",
390                                 CeedElemRestrictionDestroy_Hip);
391   CeedChkBackend(ierr);
392   return CEED_ERROR_SUCCESS;
393 }
394 
395 //------------------------------------------------------------------------------
396 // Blocked not supported
397 //------------------------------------------------------------------------------
398 int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype,
399     const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) {
400   int ierr;
401   Ceed ceed;
402   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
403   return CeedError(ceed, CEED_ERROR_BACKEND,
404                    "Backend does not implement blocked restrictions");
405 }
406 //------------------------------------------------------------------------------
407