// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <ceed/backend.h>
#include <ceed/jit-tools.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#include "../cuda/ceed-cuda-common.h"
#include "../cuda/ceed-cuda-compile.h"
#include "ceed-cuda-ref.h"

//------------------------------------------------------------------------------
// Apply restriction
//
// Dispatches one of the JIT-compiled restriction kernels depending on
// transpose mode and whether offsets were provided:
//   - NOTRANSPOSE: L-vector -> E-vector (overwrite output)
//   - TRANSPOSE:   E-vector -> L-vector (sum into output)
// The deterministic transpose path (OffsetTransposeDet) uses precomputed
// transpose offsets/indices so no atomics are needed.
//------------------------------------------------------------------------------
static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) {
  CeedElemRestriction_Cuda *impl;
  CeedCallBackend(CeedElemRestrictionGetData(r, &impl));
  Ceed ceed;
  CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
  const CeedInt num_nodes = impl->num_nodes;
  CeedInt       num_elem, elem_size;
  // Fix: return code was previously dropped; wrap like every sibling call
  CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem));
  CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size));
  CUfunction kernel;

  // Get vectors
  const CeedScalar *d_u;
  CeedScalar       *d_v;
  CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  if (t_mode == CEED_TRANSPOSE) {
    // Sum into for transpose mode, e-vec to l-vec
    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
  } else {
    // Overwrite for notranspose mode, l-vec to e-vec
    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
  }

  // Restrict
  if (t_mode == CEED_NOTRANSPOSE) {
    // L-vector -> E-vector
    if (impl->d_ind) {
      // -- Offsets provided
      kernel = impl->OffsetNoTranspose;
      void   *args[]     = {&num_elem, &impl->d_ind, &d_u, &d_v};
      // Block size: clamp elem_size to [32, 1024] (warp size .. max threads per block)
      CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024;

      CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
    } else {
      // -- Strided restriction
      kernel             = impl->StridedNoTranspose;
      void   *args[]     = {&num_elem, &d_u, &d_v};
      CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024;

      CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
    }
  } else {
    // E-vector -> L-vector
    if (impl->d_ind) {
      // -- Offsets provided
      CeedInt block_size = 32;
      if (impl->OffsetTranspose) {
        // Non-deterministic path (uses atomic adds in the kernel)
        kernel       = impl->OffsetTranspose;
        void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v};

        CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
      } else {
        // Deterministic path: one thread per L-vector node, gathering via
        // the transpose offsets/indices built in CeedElemRestrictionOffset_Cuda
        kernel       = impl->OffsetTransposeDet;
        void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v};

        CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
      }
    } else {
      // -- Strided restriction
      kernel             = impl->StridedTranspose;
      void   *args[]     = {&num_elem, &d_u, &d_v};
      CeedInt block_size = 32;

      CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args));
    }
  }

  if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL;

  // Restore arrays
  CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Get offsets
//
// Returns the host or device copy of the offsets (indices) array; no copy is
// made, the caller borrows the backend's pointer.
//------------------------------------------------------------------------------
static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) {
  CeedElemRestriction_Cuda *impl;
  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));

  switch (mem_type) {
    case CEED_MEM_HOST:
      *offsets = impl->h_ind;
      break;
    case CEED_MEM_DEVICE:
      *offsets = impl->d_ind;
      break;
  }
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Destroy restriction
//
// Frees the JIT module, the owned host indices, and all device arrays.
// cudaFree(NULL) is a no-op, so unallocated members are safe to pass.
//------------------------------------------------------------------------------
static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) {
  CeedElemRestriction_Cuda *impl;
  CeedCallBackend(CeedElemRestrictionGetData(r, &impl));

  Ceed ceed;
  CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
  CeedCallCuda(ceed, cuModuleUnload(impl->module));
  CeedCallBackend(CeedFree(&impl->h_ind_allocated));
  CeedCallCuda(ceed, cudaFree(impl->d_ind_allocated));
  CeedCallCuda(ceed, cudaFree(impl->d_t_offsets));
  CeedCallCuda(ceed, cudaFree(impl->d_t_indices));
  CeedCallCuda(ceed, cudaFree(impl->d_l_vec_indices));
  CeedCallBackend(CeedFree(&impl));

  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create transpose offsets and indices
//
// Builds, on the host, a CSR-like map from each active L-vector node to the
// list of E-vector entries that contribute to it, then uploads the three
// arrays (l_vec_indices, t_offsets, t_indices) to the device. This enables
// the deterministic (atomic-free) transpose kernel.
//------------------------------------------------------------------------------
static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const CeedInt *indices) {
  Ceed ceed;
  CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
  CeedElemRestriction_Cuda *impl;
  CeedCallBackend(CeedElemRestrictionGetData(r, &impl));
  CeedSize l_size;
  CeedInt  num_elem, elem_size, num_comp;
  CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem));
  CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size));
  CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size));
  CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp));

  // Count num_nodes: number of distinct L-vector nodes referenced by indices
  // NOTE(review): l_size is CeedSize but loop counters below are CeedInt —
  // assumes l_size fits in CeedInt (indices themselves are CeedInt); confirm upstream.
  bool *is_node;
  CeedCallBackend(CeedCalloc(l_size, &is_node));
  const CeedInt size_indices = num_elem * elem_size;
  for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1;
  CeedInt num_nodes = 0;
  for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i];
  impl->num_nodes = num_nodes;

  // L-vector offsets array: compact active nodes and record both directions
  // of the mapping (node index <-> compact offset)
  CeedInt *ind_to_offset, *l_vec_indices;
  CeedCallBackend(CeedCalloc(l_size, &ind_to_offset));
  CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices));
  CeedInt j = 0;
  for (CeedInt i = 0; i < l_size; i++) {
    if (is_node[i]) {
      l_vec_indices[j] = i;
      ind_to_offset[i] = j++;
    }
  }
  CeedCallBackend(CeedFree(&is_node));

  // Compute transpose offsets and indices
  const CeedInt size_offsets = num_nodes + 1;
  CeedInt      *t_offsets;
  CeedCallBackend(CeedCalloc(size_offsets, &t_offsets));
  CeedInt *t_indices;
  CeedCallBackend(CeedMalloc(size_indices, &t_indices));
  // Count node multiplicity
  for (CeedInt e = 0; e < num_elem; ++e) {
    for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1];
  }
  // Convert to running sum
  for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1];
  // List all E-vec indices associated with L-vec node
  for (CeedInt e = 0; e < num_elem; ++e) {
    for (CeedInt i = 0; i < elem_size; ++i) {
      const CeedInt lid = elem_size * e + i;
      const CeedInt gid = indices[lid];
      t_indices[t_offsets[ind_to_offset[gid]]++] = lid;
    }
  }
  // Reset running sum (the fill pass above advanced each offset past its range)
  for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1];
  t_offsets[0] = 0;

  // Copy data to device
  // -- L-vector indices
  CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt)));
  CeedCallCuda(ceed, cudaMemcpy(impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), cudaMemcpyHostToDevice));
  // -- Transpose offsets
  CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt)));
  CeedCallCuda(ceed, cudaMemcpy(impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), cudaMemcpyHostToDevice));
  // -- Transpose indices
  CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt)));
  CeedCallCuda(ceed, cudaMemcpy(impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), cudaMemcpyHostToDevice));

  // Cleanup
  CeedCallBackend(CeedFree(&ind_to_offset));
  CeedCallBackend(CeedFree(&l_vec_indices));
  CeedCallBackend(CeedFree(&t_offsets));
  CeedCallBackend(CeedFree(&t_indices));

  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create restriction
//
// Sets up host/device index arrays according to mem_type/copy_mode, builds
// the deterministic transpose data when required, JIT-compiles the
// restriction kernels, and registers the backend functions.
// Oriented and curl-oriented restrictions are not supported by this backend.
//------------------------------------------------------------------------------
int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients,
                                   const CeedInt8 *curl_orients, CeedElemRestriction r) {
  Ceed ceed;
  CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
  CeedElemRestriction_Cuda *impl;
  CeedCallBackend(CeedCalloc(1, &impl));
  Ceed parent;
  CeedCallBackend(CeedGetParent(ceed, &parent));
  bool is_deterministic;
  CeedCallBackend(CeedIsDeterministic(parent, &is_deterministic));
  CeedInt num_elem, num_comp, elem_size;
  CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem));
  CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp));
  CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size));
  CeedInt size        = num_elem * elem_size;
  CeedInt strides[3]  = {1, size, elem_size};
  CeedInt comp_stride = 1;

  CeedRestrictionType rstr_type;
  CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type));
  CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND,
            "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented");

  // Stride data
  bool is_strided;
  CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided));
  if (is_strided) {
    bool has_backend_strides;
    CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides));
    if (!has_backend_strides) {
      CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides));
    }
  } else {
    CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride));
  }

  // Explicit initialization (CeedCalloc already zeroed impl; kept for clarity)
  impl->h_ind           = NULL;
  impl->h_ind_allocated = NULL;
  impl->d_ind           = NULL;
  impl->d_ind_allocated = NULL;
  impl->d_t_indices     = NULL;
  impl->d_t_offsets     = NULL;
  // num_nodes defaults to total E-vector size; overwritten by
  // CeedElemRestrictionOffset_Cuda on the deterministic path
  impl->num_nodes = size;
  CeedCallBackend(CeedElemRestrictionSetData(r, impl));
  CeedInt layout[3] = {1, elem_size * num_elem, elem_size};
  CeedCallBackend(CeedElemRestrictionSetELayout(r, layout));

  // Set up device indices/offset arrays
  switch (mem_type) {
    case CEED_MEM_HOST: {
      switch (copy_mode) {
        case CEED_OWN_POINTER:
          impl->h_ind_allocated = (CeedInt *)indices;
          impl->h_ind           = (CeedInt *)indices;
          break;
        case CEED_USE_POINTER:
          impl->h_ind = (CeedInt *)indices;
          break;
        case CEED_COPY_VALUES:
          if (indices != NULL) {
            CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated));
            memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt));
            impl->h_ind = impl->h_ind_allocated;
          }
          break;
      }
      if (indices != NULL) {
        CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt)));
        impl->d_ind_allocated = impl->d_ind;  // We own the device memory
        CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyHostToDevice));
        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices));
      }
      break;
    }
    case CEED_MEM_DEVICE: {
      switch (copy_mode) {
        case CEED_COPY_VALUES:
          if (indices != NULL) {
            CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt)));
            impl->d_ind_allocated = impl->d_ind;  // We own the device memory
            CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyDeviceToDevice));
          }
          break;
        case CEED_OWN_POINTER:
          impl->d_ind           = (CeedInt *)indices;
          impl->d_ind_allocated = impl->d_ind;
          break;
        case CEED_USE_POINTER:
          impl->d_ind = (CeedInt *)indices;
      }
      if (indices != NULL) {
        // Mirror device indices on the host so GetOffsets(HOST) works
        CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated));
        CeedCallCuda(ceed, cudaMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), cudaMemcpyDeviceToHost));
        impl->h_ind = impl->h_ind_allocated;
        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices));
      }
      break;
    }
    // LCOV_EXCL_START
    default:
      return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported");
      // LCOV_EXCL_STOP
  }

  // Compile CUDA kernels (add atomicAdd function for old NVidia architectures)
  CeedInt num_nodes = impl->num_nodes;
  char   *restriction_kernel_path, *restriction_kernel_source = NULL;
  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", &restriction_kernel_path));
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
  if (!is_deterministic) {
    struct cudaDeviceProp prop;
    Ceed_Cuda            *ceed_data;
    CeedCallBackend(CeedGetData(ceed, &ceed_data));
    // Fix: cudaGetDeviceProperties returns cudaError_t, so it must go through
    // CeedCallCuda (CUDA error domain), not CeedCallBackend (CEED error domain)
    CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id));
    // Native double-precision atomicAdd requires SM60+; inject a fallback otherwise
    if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
      char *atomic_add_path;
      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path));
      CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &restriction_kernel_source));
      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
      CeedCallBackend(CeedFree(&atomic_add_path));
    }
  }
  if (!restriction_kernel_source) {
    CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
  }
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
  CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RESTR_ELEM_SIZE", elem_size, "RESTR_NUM_ELEM", num_elem,
                                   "RESTR_NUM_COMP", num_comp, "RESTR_NUM_NODES", num_nodes, "RESTR_COMP_STRIDE", comp_stride, "RESTR_STRIDE_NODES",
                                   strides[0], "RESTR_STRIDE_COMP", strides[1], "RESTR_STRIDE_ELEM", strides[2]));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose));
  CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose));
  if (!is_deterministic) {
    CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose));
  } else {
    CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet));
  }
  CeedCallBackend(CeedFree(&restriction_kernel_path));
  CeedCallBackend(CeedFree(&restriction_kernel_source));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda));
  CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Cuda));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------