xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-restriction.c (revision cfa13e89a200bdd88bed8f0c57a43555c66a749e)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 #include <stdbool.h>
14 #include <stddef.h>
15 #include <string.h>
16 
17 #include "../cuda/ceed-cuda-common.h"
18 #include "../cuda/ceed-cuda-compile.h"
19 #include "ceed-cuda-ref.h"
20 
21 //------------------------------------------------------------------------------
22 // Apply restriction
23 //------------------------------------------------------------------------------
24 static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) {
25   Ceed                      ceed;
26   CUfunction                kernel;
27   CeedInt                   num_elem, elem_size;
28   const CeedScalar         *d_u;
29   CeedScalar               *d_v;
30   CeedElemRestriction_Cuda *impl;
31 
32   CeedCallBackend(CeedElemRestrictionGetData(r, &impl));
33   CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
34   // Check the return code here too; an unchecked failure would leave num_elem uninitialized
35   CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem));
36   CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size));
37   const CeedInt num_nodes = impl->num_nodes;
38 
39   // Get vectors
40   CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
41   if (t_mode == CEED_TRANSPOSE) {
42     // Sum into for transpose mode, e-vec to l-vec
43     CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
44   } else {
45     // Overwrite for notranspose mode, l-vec to e-vec
46     CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
47   }
48 
49   // Restrict
50   if (t_mode == CEED_NOTRANSPOSE) {
51     // L-vector -> E-vector
52     // Block size: clamp elem_size to [32, 1024] (warp size .. max threads per block)
53     const CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024;
54 
55     if (impl->d_ind) {
56       // -- Offsets provided
57       kernel       = impl->OffsetNoTranspose;
58       void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v};
59 
60       CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
61     } else {
62       // -- Strided restriction
63       kernel       = impl->StridedNoTranspose;
64       void *args[] = {&num_elem, &d_u, &d_v};
65 
66       CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
67     }
68   } else {
69     // E-vector -> L-vector
70     const CeedInt block_size = 32;
71 
72     if (impl->d_ind) {
73       // -- Offsets provided
74       if (impl->OffsetTranspose) {
75         // Atomic-add transpose kernel (non-deterministic sum order)
76         kernel       = impl->OffsetTranspose;
77         void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v};
78 
79         CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
80       } else {
81         // Deterministic transpose kernel using precomputed transpose offsets/indices
82         kernel       = impl->OffsetTransposeDet;
83         void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v};
84 
85         CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
86       }
87     } else {
88       // -- Strided restriction
89       kernel       = impl->StridedTranspose;
90       void *args[] = {&num_elem, &d_u, &d_v};
91 
92       CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
93     }
94   }
95 
96   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL;
97 
98   // Restore arrays
99   CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
100   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
101   return CEED_ERROR_SUCCESS;
102 }
102 
103 //------------------------------------------------------------------------------
104 // Get offsets
105 //------------------------------------------------------------------------------
106 static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) {
107   CeedElemRestriction_Cuda *impl;
108 
109   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
110   // Hand back the copy of the offsets array resident in the requested memory space
111   if (mem_type == CEED_MEM_HOST) {
112     *offsets = impl->h_ind;
113   } else if (mem_type == CEED_MEM_DEVICE) {
114     *offsets = impl->d_ind;
115   }
116   return CEED_ERROR_SUCCESS;
117 }
120 
121 //------------------------------------------------------------------------------
122 // Destroy restriction
123 //------------------------------------------------------------------------------
124 static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) {
125   Ceed                      ceed;
126   CeedElemRestriction_Cuda *impl;
127 
128   CeedCallBackend(CeedElemRestrictionGetData(r, &impl));
129   CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
130   // Unload the JIT-compiled module before freeing backend data
131   CeedCallCuda(ceed, cuModuleUnload(impl->module));
132   // Host copy of the offsets (non-NULL only when this backend owns it)
133   CeedCallBackend(CeedFree(&impl->h_ind_allocated));
134   // Device copy of the offsets (non-NULL only when this backend owns it)
135   CeedCallCuda(ceed, cudaFree(impl->d_ind_allocated));
136   // Deterministic-transpose work arrays; NULL (a no-op free) unless built by CeedElemRestrictionOffset_Cuda
137   CeedCallCuda(ceed, cudaFree(impl->d_t_offsets));
138   CeedCallCuda(ceed, cudaFree(impl->d_t_indices));
139   CeedCallCuda(ceed, cudaFree(impl->d_l_vec_indices));
140   CeedCallBackend(CeedFree(&impl));
141   return CEED_ERROR_SUCCESS;
142 }
139 
140 //------------------------------------------------------------------------------
141 // Create transpose offsets and indices
142 //------------------------------------------------------------------------------
143 static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const CeedInt *indices) {
144   Ceed                      ceed;
145   bool                     *is_node;
146   CeedSize                  l_size;
147   CeedInt                   num_elem, elem_size, num_comp, num_nodes = 0;
148   CeedInt                  *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices;
149   CeedElemRestriction_Cuda *impl;
150 
151   CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
152   CeedCallBackend(CeedElemRestrictionGetData(r, &impl));
153   CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem));
154   CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size));
155   CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size));
156   CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp));
157   const CeedInt size_indices = num_elem * elem_size;
158 
159   // Count num_nodes (number of distinct L-vector nodes referenced by the offsets)
160   CeedCallBackend(CeedCalloc(l_size, &is_node));
161 
162   for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1;
163   // Note: l_size is a CeedSize; use a CeedSize counter so loops do not overflow for large L-vectors
164   for (CeedSize i = 0; i < l_size; i++) num_nodes += is_node[i];
165   impl->num_nodes = num_nodes;
166 
167   // L-vector offsets array: map each referenced L-vector index to its compressed position,
168   // and record the inverse map (compressed position -> L-vector index)
169   CeedCallBackend(CeedCalloc(l_size, &ind_to_offset));
170   CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices));
171   for (CeedSize i = 0, j = 0; i < l_size; i++) {
172     if (is_node[i]) {
173       l_vec_indices[j] = i;
174       ind_to_offset[i] = j++;
175     }
176   }
177   CeedCallBackend(CeedFree(&is_node));
178 
179   // Compute transpose offsets and indices (CSR-style: t_offsets[k]..t_offsets[k+1] lists the
180   // E-vector locations contributing to compressed node k)
181   const CeedInt size_offsets = num_nodes + 1;
182 
183   CeedCallBackend(CeedCalloc(size_offsets, &t_offsets));
184   CeedCallBackend(CeedMalloc(size_indices, &t_indices));
185   // Count node multiplicity
186   for (CeedInt e = 0; e < num_elem; ++e) {
187     for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1];
188   }
189   // Convert to running sum
190   for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1];
191   // List all E-vec indices associated with L-vec node
192   for (CeedInt e = 0; e < num_elem; ++e) {
193     for (CeedInt i = 0; i < elem_size; ++i) {
194       const CeedInt lid = elem_size * e + i;
195       const CeedInt gid = indices[lid];
196 
197       t_indices[t_offsets[ind_to_offset[gid]]++] = lid;
198     }
199   }
200   // Reset running sum (each bucket advanced by its own count; shift back one slot)
201   for (CeedInt i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1];
202   t_offsets[0] = 0;
203 
204   // Copy data to device
205   // -- L-vector indices
206   CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt)));
207   CeedCallCuda(ceed, cudaMemcpy(impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), cudaMemcpyHostToDevice));
208   // -- Transpose offsets
209   CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt)));
210   CeedCallCuda(ceed, cudaMemcpy(impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), cudaMemcpyHostToDevice));
211   // -- Transpose indices
212   CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt)));
213   CeedCallCuda(ceed, cudaMemcpy(impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), cudaMemcpyHostToDevice));
214 
215   // Cleanup
216   CeedCallBackend(CeedFree(&ind_to_offset));
217   CeedCallBackend(CeedFree(&l_vec_indices));
218   CeedCallBackend(CeedFree(&t_offsets));
219   CeedCallBackend(CeedFree(&t_indices));
220   return CEED_ERROR_SUCCESS;
221 }
219 
220 //------------------------------------------------------------------------------
221 // Create restriction
222 //------------------------------------------------------------------------------
223 int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients,
224                                    const CeedInt8 *curl_orients, CeedElemRestriction r) {
225   Ceed                      ceed, ceed_parent;
226   bool                      is_deterministic, is_strided;
227   CeedInt                   num_elem, num_comp, elem_size, comp_stride = 1;
228   CeedRestrictionType       rstr_type;
229   CeedElemRestriction_Cuda *impl;
230 
231   CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
232   CeedCallBackend(CeedCalloc(1, &impl));
233   CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
234   CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic));
235   CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem));
236   CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp));
237   CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size));
238   const CeedInt size       = num_elem * elem_size;
239   CeedInt       strides[3] = {1, size, elem_size};
240   CeedInt       layout[3]  = {1, elem_size * num_elem, elem_size};
241 
242   CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type));
243   CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND,
244             "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented");
245 
246   // Stride data
247   CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided));
248   if (is_strided) {
249     bool has_backend_strides;
250 
251     CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides));
252     if (!has_backend_strides) {
253       CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides));
254     }
255   } else {
256     CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride));
257   }
258 
259   impl->h_ind           = NULL;
260   impl->h_ind_allocated = NULL;
261   impl->d_ind           = NULL;
262   impl->d_ind_allocated = NULL;
263   impl->d_t_indices     = NULL;
264   impl->d_t_offsets     = NULL;
265   impl->num_nodes       = size;
266   CeedCallBackend(CeedElemRestrictionSetData(r, impl));
267   CeedCallBackend(CeedElemRestrictionSetELayout(r, layout));
268 
269   // Set up device indices/offset arrays
270   switch (mem_type) {
271     case CEED_MEM_HOST: {
272       switch (copy_mode) {
273         case CEED_OWN_POINTER:
274           impl->h_ind_allocated = (CeedInt *)indices;
275           impl->h_ind           = (CeedInt *)indices;
276           break;
277         case CEED_USE_POINTER:
278           impl->h_ind = (CeedInt *)indices;
279           break;
280         case CEED_COPY_VALUES:
281           if (indices != NULL) {
282             CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated));
283             memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt));
284             impl->h_ind = impl->h_ind_allocated;
285           }
286           break;
287       }
288       if (indices != NULL) {
289         CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt)));
290         impl->d_ind_allocated = impl->d_ind;  // We own the device memory
291         CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyHostToDevice));
292         if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices));
293       }
294       break;
295     }
296     case CEED_MEM_DEVICE: {
297       switch (copy_mode) {
298         case CEED_COPY_VALUES:
299           if (indices != NULL) {
300             CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt)));
301             impl->d_ind_allocated = impl->d_ind;  // We own the device memory
302             CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyDeviceToDevice));
303           }
304           break;
305         case CEED_OWN_POINTER:
306           impl->d_ind           = (CeedInt *)indices;
307           impl->d_ind_allocated = impl->d_ind;
308           break;
309         case CEED_USE_POINTER:
310           impl->d_ind = (CeedInt *)indices;
311           break;
312       }
313       if (indices != NULL) {
314         // Mirror the device indices on the host; the host copy is needed for GetOffsets
315         // and for building the deterministic transpose maps below
316         CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated));
317         CeedCallCuda(ceed, cudaMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), cudaMemcpyDeviceToHost));
318         impl->h_ind = impl->h_ind_allocated;
319         // Pass the host copy: CeedElemRestrictionOffset_Cuda dereferences the indices on the
320         // host, and `indices` is a device pointer in this branch
321         if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, impl->h_ind));
322       }
323       break;
324     }
325     // LCOV_EXCL_START
326     default:
327       return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported");
328       // LCOV_EXCL_STOP
329   }
330 
331   // Compile CUDA kernels (add atomicAdd function for old NVidia architectures)
332   CeedInt num_nodes = impl->num_nodes;
333   char   *restriction_kernel_path, *restriction_kernel_source = NULL;
334 
335   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", &restriction_kernel_path));
336   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
337   if (!is_deterministic) {
338     struct cudaDeviceProp prop;
339     Ceed_Cuda            *ceed_data;
340 
341     CeedCallBackend(CeedGetData(ceed, &ceed_data));
342     // cudaGetDeviceProperties returns a cudaError_t, so check it with CeedCallCuda
343     CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id));
344     // Pre-Pascal GPUs lack native double-precision atomicAdd; prepend a software fallback
345     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
346       char *atomic_add_path;
347 
348       CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path));
349       CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &restriction_kernel_source));
350       CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
351       CeedCallBackend(CeedFree(&atomic_add_path));
352     }
353   }
354   if (!restriction_kernel_source) {
355     CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
356   }
357   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
358   CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
359                                    "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES",
360                                    strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2]));
361   CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose));
362   CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose));
363   CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose));
364   // Deterministic backends use the precomputed-transpose kernel; others use the atomic kernel
365   if (!is_deterministic) {
366     CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose));
367   } else {
368     CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet));
369   }
370   CeedCallBackend(CeedFree(&restriction_kernel_path));
371   CeedCallBackend(CeedFree(&restriction_kernel_source));
372 
373   // Register backend functions
374   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Cuda));
375   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Cuda));
376   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Cuda));
377   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda));
378   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Cuda));
379   return CEED_ERROR_SUCCESS;
380 }
373 
374 //------------------------------------------------------------------------------
375