1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <stddef.h> 15 #include <string.h> 16 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #include "ceed-cuda-ref.h" 20 21 //------------------------------------------------------------------------------ 22 // Apply restriction 23 //------------------------------------------------------------------------------ 24 static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { 25 Ceed ceed; 26 Ceed_Cuda *data; 27 CUfunction kernel; 28 CeedInt num_elem, elem_size; 29 const CeedScalar *d_u; 30 CeedScalar *d_v; 31 CeedElemRestriction_Cuda *impl; 32 33 CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 34 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 35 CeedCallBackend(CeedGetData(ceed, &data)); 36 CeedElemRestrictionGetNumElements(r, &num_elem); 37 CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); 38 const CeedInt num_nodes = impl->num_nodes; 39 40 // Get vectors 41 CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 42 if (t_mode == CEED_TRANSPOSE) { 43 // Sum into for transpose mode, e-vec to l-vec 44 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 45 } else { 46 // Overwrite for notranspose mode, l-vec to e-vec 47 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 48 } 49 50 // Restrict 51 if (t_mode == CEED_NOTRANSPOSE) { 52 // L-vector -> E-vector 53 if (impl->d_ind) { 54 // -- Offsets provided 55 kernel = impl->OffsetNoTranspose; 56 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 57 CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; 58 59 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 60 } else { 61 // -- Strided restriction 62 kernel = impl->StridedNoTranspose; 63 void *args[] = {&num_elem, &d_u, &d_v}; 64 CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; 65 66 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 67 } 68 } else { 69 // E-vector -> L-vector 70 if (impl->d_ind) { 71 // -- Offsets provided 72 CeedInt block_size = 32; 73 74 if (impl->OffsetTranspose) { 75 kernel = impl->OffsetTranspose; 76 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 77 78 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 79 } else { 80 kernel = impl->OffsetTransposeDet; 81 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 82 83 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 84 } 85 } else { 86 // -- Strided restriction 87 kernel = impl->StridedTranspose; 88 void *args[] = {&num_elem, &d_u, &d_v}; 89 CeedInt block_size = 32; 90 91 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 92 } 93 } 94 95 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; 96 97 // Restore arrays 98 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 99 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 100 return CEED_ERROR_SUCCESS; 101 } 102 103 //------------------------------------------------------------------------------ 104 // Get offsets 105 //------------------------------------------------------------------------------ 106 static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { 107 CeedElemRestriction_Cuda *impl; 108 109 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 110 switch (mem_type) { 111 case CEED_MEM_HOST: 112 *offsets = impl->h_ind; 113 break; 114 case CEED_MEM_DEVICE: 115 *offsets = impl->d_ind; 116 break; 117 } 118 return CEED_ERROR_SUCCESS; 119 } 120 121 //------------------------------------------------------------------------------ 122 // Destroy restriction 123 //------------------------------------------------------------------------------ 124 static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) { 125 Ceed ceed; 126 CeedElemRestriction_Cuda *impl; 127 128 CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 129 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 130 CeedCallCuda(ceed, cuModuleUnload(impl->module)); 131 CeedCallBackend(CeedFree(&impl->h_ind_allocated)); 132 CeedCallCuda(ceed, cudaFree(impl->d_ind_allocated)); 133 CeedCallCuda(ceed, cudaFree(impl->d_t_offsets)); 134 CeedCallCuda(ceed, cudaFree(impl->d_t_indices)); 135 CeedCallCuda(ceed, cudaFree(impl->d_l_vec_indices)); 136 CeedCallBackend(CeedFree(&impl)); 137 return CEED_ERROR_SUCCESS; 138 } 139 140 //------------------------------------------------------------------------------ 141 // Create transpose offsets and indices 142 //------------------------------------------------------------------------------ 143 static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const CeedInt *indices) { 144 Ceed ceed; 145 bool *is_node; 146 CeedSize l_size; 147 CeedInt num_elem, elem_size, num_comp, num_nodes = 0; 148 CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; 149 CeedElemRestriction_Cuda *impl; 150 151 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 152 CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 153 CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); 154 CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); 155 CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); 156 CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); 157 const CeedInt size_indices = num_elem * elem_size; 158 159 // Count num_nodes 160 CeedCallBackend(CeedCalloc(l_size, &is_node)); 161 162 for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; 163 for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; 164 impl->num_nodes = num_nodes; 165 166 // L-vector offsets array 167 CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); 168 CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); 169 for (CeedInt i = 0, j = 0; i < l_size; i++) { 170 if (is_node[i]) { 171 l_vec_indices[j] = i; 172 ind_to_offset[i] = j++; 173 } 174 } 175 CeedCallBackend(CeedFree(&is_node)); 176 177 // Compute transpose offsets and indices 178 const CeedInt size_offsets = num_nodes + 1; 179 180 CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); 181 CeedCallBackend(CeedMalloc(size_indices, &t_indices)); 182 // Count node multiplicity 183 for (CeedInt e = 0; e < num_elem; ++e) { 184 for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1]; 185 } 186 // Convert to running sum 187 for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1]; 188 // List all E-vec indices associated with L-vec node 189 for (CeedInt e = 0; e < num_elem; ++e) { 190 for (CeedInt i = 0; i < elem_size; ++i) { 191 const CeedInt lid = elem_size * e + i; 192 const CeedInt gid = indices[lid]; 193 194 t_indices[t_offsets[ind_to_offset[gid]]++] = lid; 195 } 196 } 197 // Reset running sum 198 for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1]; 199 t_offsets[0] = 0; 200 201 // Copy data to device 202 // -- L-vector indices 203 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt))); 204 CeedCallCuda(ceed, cudaMemcpy(impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), cudaMemcpyHostToDevice)); 205 // -- Transpose offsets 206 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt))); 207 CeedCallCuda(ceed, cudaMemcpy(impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), cudaMemcpyHostToDevice)); 208 // -- Transpose indices 209 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt))); 210 CeedCallCuda(ceed, cudaMemcpy(impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), cudaMemcpyHostToDevice)); 211 212 // Cleanup 213 CeedCallBackend(CeedFree(&ind_to_offset)); 214 CeedCallBackend(CeedFree(&l_vec_indices)); 215 CeedCallBackend(CeedFree(&t_offsets)); 216 CeedCallBackend(CeedFree(&t_indices)); 217 return CEED_ERROR_SUCCESS; 218 } 219 220 //------------------------------------------------------------------------------ 221 // Create restriction 222 //------------------------------------------------------------------------------ 223 int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, 224 const CeedInt8 *curl_orients, CeedElemRestriction r) { 225 Ceed ceed, ceed_parent; 226 bool is_deterministic, is_strided; 227 CeedInt num_elem, num_comp, elem_size, comp_stride = 1; 228 CeedRestrictionType rstr_type; 229 CeedElemRestriction_Cuda *impl; 230 231 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 232 CeedCallBackend(CeedCalloc(1, &impl)); 233 CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); 234 CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); 235 CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); 236 CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); 237 CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); 238 const CeedInt size = num_elem * elem_size; 239 CeedInt strides[3] = {1, size, elem_size}; 240 CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; 241 242 CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); 243 CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, 244 "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); 245 246 // Stride data 247 CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); 248 if (is_strided) { 249 bool has_backend_strides; 250 251 CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); 252 if (!has_backend_strides) { 253 CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); 254 } 255 } else { 256 CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); 257 } 258 259 impl->h_ind = NULL; 260 impl->h_ind_allocated = NULL; 261 impl->d_ind = NULL; 262 impl->d_ind_allocated = NULL; 263 impl->d_t_indices = NULL; 264 impl->d_t_offsets = NULL; 265 impl->num_nodes = size; 266 CeedCallBackend(CeedElemRestrictionSetData(r, impl)); 267 CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); 268 269 // Set up device indices/offset arrays 270 switch (mem_type) { 271 case CEED_MEM_HOST: { 272 switch (copy_mode) { 273 case CEED_OWN_POINTER: 274 impl->h_ind_allocated = (CeedInt *)indices; 275 impl->h_ind = (CeedInt *)indices; 276 break; 277 case CEED_USE_POINTER: 278 impl->h_ind = (CeedInt *)indices; 279 break; 280 case CEED_COPY_VALUES: 281 if (indices != NULL) { 282 CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); 283 memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); 284 impl->h_ind = impl->h_ind_allocated; 285 } 286 break; 287 } 288 if (indices != NULL) { 289 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); 290 impl->d_ind_allocated = impl->d_ind; // We own the device memory 291 CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyHostToDevice)); 292 if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices)); 293 } 294 break; 295 } 296 case CEED_MEM_DEVICE: { 297 switch (copy_mode) { 298 case CEED_COPY_VALUES: 299 if (indices != NULL) { 300 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); 301 impl->d_ind_allocated = impl->d_ind; // We own the device memory 302 CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyDeviceToDevice)); 303 } 304 break; 305 case CEED_OWN_POINTER: 306 impl->d_ind = (CeedInt *)indices; 307 impl->d_ind_allocated = impl->d_ind; 308 break; 309 case CEED_USE_POINTER: 310 impl->d_ind = (CeedInt *)indices; 311 } 312 if (indices != NULL) { 313 CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); 314 CeedCallCuda(ceed, cudaMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), cudaMemcpyDeviceToHost)); 315 impl->h_ind = impl->h_ind_allocated; 316 if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices)); 317 } 318 break; 319 } 320 // LCOV_EXCL_START 321 default: 322 return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported"); 323 // LCOV_EXCL_STOP 324 } 325 326 // Compile CUDA kernels (add atomicAdd function for old NVidia architectures) 327 CeedInt num_nodes = impl->num_nodes; 328 char *restriction_kernel_path, *restriction_kernel_source = NULL; 329 330 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", &restriction_kernel_path)); 331 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 332 if (!is_deterministic) { 333 struct cudaDeviceProp prop; 334 Ceed_Cuda *ceed_data; 335 336 CeedCallBackend(CeedGetData(ceed, &ceed_data)); 337 CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); 338 if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { 339 char *atomic_add_path; 340 341 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path)); 342 CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &restriction_kernel_source)); 343 CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 344 CeedCallBackend(CeedFree(&atomic_add_path)); 345 } 346 } 347 if (!restriction_kernel_source) { 348 CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 349 } 350 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 351 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 352 "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", 353 strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); 354 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); 355 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); 356 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); 357 if (!is_deterministic) { 358 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); 359 } else { 360 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet)); 361 } 362 CeedCallBackend(CeedFree(&restriction_kernel_path)); 363 CeedCallBackend(CeedFree(&restriction_kernel_source)); 364 365 // Register backend functions 366 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Cuda)); 367 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Cuda)); 368 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Cuda)); 369 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda)); 370 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Cuda)); 371 return CEED_ERROR_SUCCESS; 372 } 373 374 //------------------------------------------------------------------------------ 375