xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-restriction.c (revision 53f7acb178914a16137d9c91c84843f149b8f9af)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 #include <stdbool.h>
14 #include <stddef.h>
15 #include <string.h>
16 
17 #include "../cuda/ceed-cuda-common.h"
18 #include "../cuda/ceed-cuda-compile.h"
19 #include "ceed-cuda-ref.h"
20 
21 //------------------------------------------------------------------------------
22 // Apply restriction
23 //------------------------------------------------------------------------------
24 static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) {
25   CeedElemRestriction_Cuda *impl;
26   CeedCallBackend(CeedElemRestrictionGetData(r, &impl));
27   Ceed ceed;
28   CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
29   Ceed_Cuda *data;
30   CeedCallBackend(CeedGetData(ceed, &data));
31   const CeedInt num_nodes = impl->num_nodes;
32   CeedInt       num_elem, elem_size;
33   CeedElemRestrictionGetNumElements(r, &num_elem);
34   CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size));
35   CUfunction kernel;
36 
37   // Get vectors
38   const CeedScalar *d_u;
39   CeedScalar       *d_v;
40   CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
41   if (t_mode == CEED_TRANSPOSE) {
42     // Sum into for transpose mode, e-vec to l-vec
43     CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
44   } else {
45     // Overwrite for notranspose mode, l-vec to e-vec
46     CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
47   }
48 
49   // Restrict
50   if (t_mode == CEED_NOTRANSPOSE) {
51     // L-vector -> E-vector
52     if (impl->d_ind) {
53       // -- Offsets provided
54       kernel             = impl->OffsetNoTranspose;
55       void   *args[]     = {&num_elem, &impl->d_ind, &d_u, &d_v};
56       CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024;
57 
58       CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
59     } else {
60       // -- Strided restriction
61       kernel             = impl->StridedNoTranspose;
62       void   *args[]     = {&num_elem, &d_u, &d_v};
63       CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024;
64 
65       CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
66     }
67   } else {
68     // E-vector -> L-vector
69     if (impl->d_ind) {
70       // -- Offsets provided
71       CeedInt block_size = 32;
72       if (impl->OffsetTranspose) {
73         kernel       = impl->OffsetTranspose;
74         void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v};
75 
76         CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
77       } else {
78         kernel       = impl->OffsetTransposeDet;
79         void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v};
80 
81         CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
82       }
83     } else {
84       // -- Strided restriction
85       kernel             = impl->StridedTranspose;
86       void   *args[]     = {&num_elem, &d_u, &d_v};
87       CeedInt block_size = 32;
88 
89       CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
90     }
91   }
92 
93   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL;
94 
95   // Restore arrays
96   CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
97   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
98   return CEED_ERROR_SUCCESS;
99 }
100 
101 //------------------------------------------------------------------------------
102 // Get offsets
103 //------------------------------------------------------------------------------
104 static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) {
105   CeedElemRestriction_Cuda *impl;
106   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
107 
108   switch (mem_type) {
109     case CEED_MEM_HOST:
110       *offsets = impl->h_ind;
111       break;
112     case CEED_MEM_DEVICE:
113       *offsets = impl->d_ind;
114       break;
115   }
116   return CEED_ERROR_SUCCESS;
117 }
118 
119 //------------------------------------------------------------------------------
120 // Destroy restriction
121 //------------------------------------------------------------------------------
122 static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) {
123   CeedElemRestriction_Cuda *impl;
124   CeedCallBackend(CeedElemRestrictionGetData(r, &impl));
125 
126   Ceed ceed;
127   CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
128   CeedCallCuda(ceed, cuModuleUnload(impl->module));
129   CeedCallBackend(CeedFree(&impl->h_ind_allocated));
130   CeedCallCuda(ceed, cudaFree(impl->d_ind_allocated));
131   CeedCallCuda(ceed, cudaFree(impl->d_t_offsets));
132   CeedCallCuda(ceed, cudaFree(impl->d_t_indices));
133   CeedCallCuda(ceed, cudaFree(impl->d_l_vec_indices));
134   CeedCallBackend(CeedFree(&impl));
135 
136   return CEED_ERROR_SUCCESS;
137 }
138 
139 //------------------------------------------------------------------------------
140 // Create transpose offsets and indices
141 //------------------------------------------------------------------------------
142 static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const CeedInt *indices) {
143   Ceed ceed;
144   CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
145   CeedElemRestriction_Cuda *impl;
146   CeedCallBackend(CeedElemRestrictionGetData(r, &impl));
147   CeedSize l_size;
148   CeedInt  num_elem, elem_size, num_comp;
149   CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem));
150   CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size));
151   CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size));
152   CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp));
153 
154   // Count num_nodes
155   bool *is_node;
156   CeedCallBackend(CeedCalloc(l_size, &is_node));
157   const CeedInt size_indices = num_elem * elem_size;
158   for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1;
159   CeedInt num_nodes = 0;
160   for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i];
161   impl->num_nodes = num_nodes;
162 
163   // L-vector offsets array
164   CeedInt *ind_to_offset, *l_vec_indices;
165   CeedCallBackend(CeedCalloc(l_size, &ind_to_offset));
166   CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices));
167   CeedInt j = 0;
168   for (CeedInt i = 0; i < l_size; i++) {
169     if (is_node[i]) {
170       l_vec_indices[j] = i;
171       ind_to_offset[i] = j++;
172     }
173   }
174   CeedCallBackend(CeedFree(&is_node));
175 
176   // Compute transpose offsets and indices
177   const CeedInt size_offsets = num_nodes + 1;
178   CeedInt      *t_offsets;
179   CeedCallBackend(CeedCalloc(size_offsets, &t_offsets));
180   CeedInt *t_indices;
181   CeedCallBackend(CeedMalloc(size_indices, &t_indices));
182   // Count node multiplicity
183   for (CeedInt e = 0; e < num_elem; ++e) {
184     for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1];
185   }
186   // Convert to running sum
187   for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1];
188   // List all E-vec indices associated with L-vec node
189   for (CeedInt e = 0; e < num_elem; ++e) {
190     for (CeedInt i = 0; i < elem_size; ++i) {
191       const CeedInt lid                          = elem_size * e + i;
192       const CeedInt gid                          = indices[lid];
193       t_indices[t_offsets[ind_to_offset[gid]]++] = lid;
194     }
195   }
196   // Reset running sum
197   for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1];
198   t_offsets[0] = 0;
199 
200   // Copy data to device
201   // -- L-vector indices
202   CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt)));
203   CeedCallCuda(ceed, cudaMemcpy(impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), cudaMemcpyHostToDevice));
204   // -- Transpose offsets
205   CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt)));
206   CeedCallCuda(ceed, cudaMemcpy(impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), cudaMemcpyHostToDevice));
207   // -- Transpose indices
208   CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt)));
209   CeedCallCuda(ceed, cudaMemcpy(impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), cudaMemcpyHostToDevice));
210 
211   // Cleanup
212   CeedCallBackend(CeedFree(&ind_to_offset));
213   CeedCallBackend(CeedFree(&l_vec_indices));
214   CeedCallBackend(CeedFree(&t_offsets));
215   CeedCallBackend(CeedFree(&t_indices));
216 
217   return CEED_ERROR_SUCCESS;
218 }
219 
220 //------------------------------------------------------------------------------
221 // Create restriction
222 //------------------------------------------------------------------------------
223 int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients,
224                                    const CeedInt8 *curl_orients, CeedElemRestriction r) {
225   Ceed ceed;
226   CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
227   CeedElemRestriction_Cuda *impl;
228   CeedCallBackend(CeedCalloc(1, &impl));
229   Ceed parent;
230   CeedCallBackend(CeedGetParent(ceed, &parent));
231   bool is_deterministic;
232   CeedCallBackend(CeedIsDeterministic(parent, &is_deterministic));
233   CeedInt num_elem, num_comp, elem_size;
234   CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem));
235   CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp));
236   CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size));
237   CeedInt size        = num_elem * elem_size;
238   CeedInt strides[3]  = {1, size, elem_size};
239   CeedInt comp_stride = 1;
240 
241   CeedRestrictionType rstr_type;
242   CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type));
243   CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND,
244             "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented");
245 
246   // Stride data
247   bool is_strided;
248   CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided));
249   if (is_strided) {
250     bool has_backend_strides;
251     CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides));
252     if (!has_backend_strides) {
253       CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides));
254     }
255   } else {
256     CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride));
257   }
258 
259   impl->h_ind           = NULL;
260   impl->h_ind_allocated = NULL;
261   impl->d_ind           = NULL;
262   impl->d_ind_allocated = NULL;
263   impl->d_t_indices     = NULL;
264   impl->d_t_offsets     = NULL;
265   impl->num_nodes       = size;
266   CeedCallBackend(CeedElemRestrictionSetData(r, impl));
267   CeedInt layout[3] = {1, elem_size * num_elem, elem_size};
268   CeedCallBackend(CeedElemRestrictionSetELayout(r, layout));
269 
270   // Set up device indices/offset arrays
271   switch (mem_type) {
272     case CEED_MEM_HOST: {
273       switch (copy_mode) {
274         case CEED_OWN_POINTER:
275           impl->h_ind_allocated = (CeedInt *)indices;
276           impl->h_ind           = (CeedInt *)indices;
277           break;
278         case CEED_USE_POINTER:
279           impl->h_ind = (CeedInt *)indices;
280           break;
281         case CEED_COPY_VALUES:
282           if (indices != NULL) {
283             CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated));
284             memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt));
285             impl->h_ind = impl->h_ind_allocated;
286           }
287           break;
288       }
289       if (indices != NULL) {
290         CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt)));
291         impl->d_ind_allocated = impl->d_ind;  // We own the device memory
292         CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyHostToDevice));
293         if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices));
294       }
295       break;
296     }
297     case CEED_MEM_DEVICE: {
298       switch (copy_mode) {
299         case CEED_COPY_VALUES:
300           if (indices != NULL) {
301             CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt)));
302             impl->d_ind_allocated = impl->d_ind;  // We own the device memory
303             CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyDeviceToDevice));
304           }
305           break;
306         case CEED_OWN_POINTER:
307           impl->d_ind           = (CeedInt *)indices;
308           impl->d_ind_allocated = impl->d_ind;
309           break;
310         case CEED_USE_POINTER:
311           impl->d_ind = (CeedInt *)indices;
312       }
313       if (indices != NULL) {
314         CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated));
315         CeedCallCuda(ceed, cudaMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), cudaMemcpyDeviceToHost));
316         impl->h_ind = impl->h_ind_allocated;
317         if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices));
318       }
319       break;
320     }
321     // LCOV_EXCL_START
322     default:
323       return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported");
324       // LCOV_EXCL_STOP
325   }
326 
327   // Compile CUDA kernels (add atomicAdd function for old NVidia architectures)
328   CeedInt num_nodes = impl->num_nodes;
329   char   *restriction_kernel_path, *restriction_kernel_source = NULL;
330   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", &restriction_kernel_path));
331   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
332   if (!is_deterministic) {
333     struct cudaDeviceProp prop;
334     Ceed_Cuda            *ceed_data;
335     CeedCallBackend(CeedGetData(ceed, &ceed_data));
336     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
337     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
338       char *atomic_add_path;
339       CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path));
340       CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &restriction_kernel_source));
341       CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
342       CeedCallBackend(CeedFree(&atomic_add_path));
343     }
344   }
345   if (!restriction_kernel_source) {
346     CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
347   }
348   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
349   CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RESTR_ELEM_SIZE", elem_size, "RESTR_NUM_ELEM", num_elem,
350                                    "RESTR_NUM_COMP", num_comp, "RESTR_NUM_NODES", num_nodes, "RESTR_COMP_STRIDE", comp_stride, "RESTR_STRIDE_NODES",
351                                    strides[0], "RESTR_STRIDE_COMP", strides[1], "RESTR_STRIDE_ELEM", strides[2]));
352   CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose));
353   CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose));
354   CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose));
355   if (!is_deterministic) {
356     CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose));
357   } else {
358     CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet));
359   }
360   CeedCallBackend(CeedFree(&restriction_kernel_path));
361   CeedCallBackend(CeedFree(&restriction_kernel_source));
362 
363   // Register backend functions
364   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Cuda));
365   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Cuda));
366   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Cuda));
367   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda));
368   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Cuda));
369   return CEED_ERROR_SUCCESS;
370 }
371 
372 //------------------------------------------------------------------------------
373