1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-source/cuda/cuda-types.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stddef.h> 14 #include <string.h> 15 16 #include "../cuda/ceed-cuda-common.h" 17 #include "../cuda/ceed-cuda-compile.h" 18 #include "ceed-cuda-gen-operator-build.h" 19 #include "ceed-cuda-gen.h" 20 21 //------------------------------------------------------------------------------ 22 // Destroy operator 23 //------------------------------------------------------------------------------ 24 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 25 Ceed ceed; 26 CeedOperator_Cuda_gen *impl; 27 28 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 29 CeedCallBackend(CeedOperatorGetData(op, &impl)); 30 if (impl->module) CeedCallCuda(ceed, cuModuleUnload(impl->module)); 31 if (impl->module_assemble_full) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_full)); 32 if (impl->module_assemble_diagonal) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_diagonal)); 33 if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem)); 34 CeedCallBackend(CeedFree(&impl)); 35 CeedCallBackend(CeedDestroy(&ceed)); 36 return CEED_ERROR_SUCCESS; 37 } 38 39 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, int elems_per_block) { 40 int useful_threads_per_block = threads_per_elem * elems_per_block; 41 // round up to nearest multiple of warp_size 42 int block_size = CeedDivUpInt(useful_threads_per_block, warp_size) * warp_size; 43 int blocks_per_sm = threads_per_sm / block_size; 44 return threads_per_sm - useful_threads_per_block * blocks_per_sm; 45 } 46 47 // Choose the least wasteful block size constrained by blocks_per_sm of max_threads_per_block. 48 // 49 // The x and y part of block[] contains per-element sizes (specified on input) while the z part is number of elements. 50 // 51 // Problem setting: we'd like to make occupancy high with relatively few inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how 52 // many threads can run. 53 // 54 // Note that full occupancy sometimes can't be achieved by one thread block. 55 // For example, an SM might support 1536 threads in total, but only 1024 within a single thread block. 56 // So cuOccupancyMaxPotentialBlockSize may suggest a block size of 768 so that two blocks can run, versus one block of 1024 will prevent a second 57 // block from running. The cuda-gen kernels are pretty heavy with lots of instruction-level parallelism (ILP) so we'll generally be okay with 58 // relatively low occupancy and smaller thread blocks, but we solve a reasonably general problem here. Empirically, we find that blocks bigger than 59 // about 256 have higher latency and worse load balancing when the number of elements is modest. 60 // 61 // cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of the number of quadrature points (or number of basis functions). 62 // They also have a lot of __syncthreads(), which is another point against excessively large thread blocks. 63 // Suppose I have elements with 7x7x7 quadrature points. 64 // This will loop over the last dimension, so we have 7*7=49 threads per element. 65 // Suppose we have two elements = 2*49=98 useful threads. 66 // CUDA schedules in units of full warps (32 threads), so 128 CUDA hardware threads are effectively committed to that block. 67 // Now suppose cuOccupancyMaxPotentialBlockSize returned 352. 68 // We can schedule 2 blocks of size 98 (196 useful threads using 256 hardware threads), but not a third block (which would need a total of 384 69 // hardware threads). 70 // 71 // If instead, we had packed 3 elements, we'd have 3*49=147 useful threads occupying 160 slots, and could schedule two blocks. 72 // Alternatively, we could pack a single block of 7 elements (2*49=343 useful threads) into the 354 slots. 73 // The latter has the least "waste", but __syncthreads() over-synchronizes and it might not pay off relative to smaller blocks. 74 static int BlockGridCalculate(CeedInt num_elem, int blocks_per_sm, int max_threads_per_block, int max_threads_z, int warp_size, int block[3], 75 int *grid) { 76 const int threads_per_sm = blocks_per_sm * max_threads_per_block; 77 const int threads_per_elem = block[0] * block[1]; 78 int elems_per_block = 1; 79 int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1); 80 81 for (int i = 2; i <= CeedIntMin(max_threads_per_block / threads_per_elem, num_elem); i++) { 82 int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i); 83 84 // We want to minimize waste, but smaller kernels have lower latency and less __syncthreads() overhead so when a larger block size has the same 85 // waste as a smaller one, go ahead and prefer the smaller block. 86 if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) { 87 elems_per_block = i; 88 waste = i_waste; 89 } 90 } 91 // In low-order elements, threads_per_elem may be sufficiently low to give an elems_per_block greater than allowable for the device, so we must 92 // check before setting the z-dimension size of the block. 93 block[2] = CeedIntMin(elems_per_block, max_threads_z); 94 *grid = CeedDivUpInt(num_elem, elems_per_block); 95 return CEED_ERROR_SUCCESS; 96 } 97 98 // callback for cuOccupancyMaxPotentialBlockSize, providing the amount of dynamic shared memory required for a thread block of size threads. 99 static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); } 100 101 //------------------------------------------------------------------------------ 102 // Apply and add to output 103 //------------------------------------------------------------------------------ 104 static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good, 105 CeedRequest *request) { 106 bool is_at_points, is_tensor; 107 Ceed ceed; 108 Ceed_Cuda *cuda_data; 109 CeedInt num_elem, num_input_fields, num_output_fields; 110 CeedEvalMode eval_mode; 111 CeedQFunctionField *qf_input_fields, *qf_output_fields; 112 CeedQFunction_Cuda_gen *qf_data; 113 CeedQFunction qf; 114 CeedOperatorField *op_input_fields, *op_output_fields; 115 CeedOperator_Cuda_gen *data; 116 117 // Build the operator kernel 118 CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, is_run_good)); 119 if (!(*is_run_good)) return CEED_ERROR_SUCCESS; 120 121 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 122 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 123 CeedCallBackend(CeedOperatorGetData(op, &data)); 124 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 125 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 126 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 127 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 128 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 129 130 // Input vectors 131 for (CeedInt i = 0; i < num_input_fields; i++) { 132 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 133 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 134 data->fields.inputs[i] = NULL; 135 } else { 136 bool is_active; 137 CeedVector vec; 138 139 // Get input vector 140 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 141 is_active = vec == CEED_VECTOR_ACTIVE; 142 if (is_active) data->fields.inputs[i] = input_arr; 143 else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); 144 CeedCallBackend(CeedVectorDestroy(&vec)); 145 } 146 } 147 148 // Output vectors 149 for (CeedInt i = 0; i < num_output_fields; i++) { 150 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 151 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 152 data->fields.outputs[i] = NULL; 153 } else { 154 bool is_active; 155 CeedVector vec; 156 157 // Get output vector 158 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 159 is_active = vec == CEED_VECTOR_ACTIVE; 160 if (is_active) data->fields.outputs[i] = output_arr; 161 else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i])); 162 CeedCallBackend(CeedVectorDestroy(&vec)); 163 } 164 } 165 166 // Point coordinates, if needed 167 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 168 if (is_at_points) { 169 // Coords 170 CeedVector vec; 171 172 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 173 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); 174 CeedCallBackend(CeedVectorDestroy(&vec)); 175 176 // Points per elem 177 if (num_elem != data->points.num_elem) { 178 CeedInt *points_per_elem; 179 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 180 CeedElemRestriction rstr_points = NULL; 181 182 data->points.num_elem = num_elem; 183 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 184 CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); 185 for (CeedInt e = 0; e < num_elem; e++) { 186 CeedInt num_points_elem; 187 188 CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); 189 points_per_elem[e] = num_points_elem; 190 } 191 if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); 192 CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); 193 CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); 194 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 195 CeedCallBackend(CeedFree(&points_per_elem)); 196 } 197 } 198 199 // Get context data 200 CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); 201 202 // Apply operator 203 void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points}; 204 int max_threads_per_block, min_grid_size, grid; 205 206 CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor)); 207 CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 208 int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1}; 209 210 if (is_tensor) { 211 CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, is_at_points ? 1 : max_threads_per_block, 212 cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); 213 } else { 214 CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1)); 215 216 grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 217 block[2] = elems_per_block; 218 } 219 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 220 221 CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, stream, grid, block[0], block[1], block[2], shared_mem, is_run_good, opargs)); 222 223 // Restore input arrays 224 for (CeedInt i = 0; i < num_input_fields; i++) { 225 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 226 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 227 } else { 228 bool is_active; 229 CeedVector vec; 230 231 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 232 is_active = vec == CEED_VECTOR_ACTIVE; 233 if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); 234 CeedCallBackend(CeedVectorDestroy(&vec)); 235 } 236 } 237 238 // Restore output arrays 239 for (CeedInt i = 0; i < num_output_fields; i++) { 240 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 241 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 242 } else { 243 bool is_active; 244 CeedVector vec; 245 246 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 247 is_active = vec == CEED_VECTOR_ACTIVE; 248 if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i])); 249 CeedCallBackend(CeedVectorDestroy(&vec)); 250 } 251 } 252 253 // Restore point coordinates, if needed 254 if (is_at_points) { 255 CeedVector vec; 256 257 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 258 CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); 259 CeedCallBackend(CeedVectorDestroy(&vec)); 260 } 261 262 // Restore context data 263 CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); 264 265 // Cleanup 266 CeedCallBackend(CeedDestroy(&ceed)); 267 CeedCallBackend(CeedQFunctionDestroy(&qf)); 268 if (!(*is_run_good)) data->use_fallback = true; 269 return CEED_ERROR_SUCCESS; 270 } 271 272 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { 273 bool is_run_good = false; 274 const CeedScalar *input_arr = NULL; 275 CeedScalar *output_arr = NULL; 276 277 // Try to run kernel 278 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr)); 279 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr)); 280 CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, NULL, input_arr, output_arr, &is_run_good, request)); 281 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr)); 282 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr)); 283 284 // Fallback on unsuccessful run 285 if (!is_run_good) { 286 CeedOperator op_fallback; 287 288 CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator"); 289 CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); 290 CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); 291 } 292 return CEED_ERROR_SUCCESS; 293 } 294 295 static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { 296 bool is_run_good[CEED_COMPOSITE_MAX] = {false}; 297 CeedInt num_suboperators; 298 const CeedScalar *input_arr = NULL; 299 CeedScalar *output_arr = NULL; 300 Ceed ceed; 301 CeedOperator *sub_operators; 302 303 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 304 CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); 305 CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); 306 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr)); 307 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr)); 308 for (CeedInt i = 0; i < num_suboperators; i++) { 309 CeedInt num_elem = 0; 310 311 CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem)); 312 if (num_elem > 0) { 313 cudaStream_t stream = NULL; 314 315 CeedCallCuda(ceed, cudaStreamCreate(&stream)); 316 CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request)); 317 CeedCallCuda(ceed, cudaStreamDestroy(stream)); 318 } 319 } 320 if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr)); 321 if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr)); 322 CeedCallCuda(ceed, cudaDeviceSynchronize()); 323 324 // Fallback on unsuccessful run 325 for (CeedInt i = 0; i < num_suboperators; i++) { 326 if (!is_run_good[i]) { 327 CeedOperator op_fallback; 328 329 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator"); 330 CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback)); 331 CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); 332 } 333 } 334 CeedCallBackend(CeedDestroy(&ceed)); 335 return CEED_ERROR_SUCCESS; 336 } 337 338 //------------------------------------------------------------------------------ 339 // AtPoints diagonal assembly 340 //------------------------------------------------------------------------------ 341 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) { 342 Ceed ceed; 343 CeedOperator_Cuda_gen *data; 344 345 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 346 CeedCallBackend(CeedOperatorGetData(op, &data)); 347 348 // Build the assembly kernel 349 if (!data->assemble_diagonal && !data->use_assembly_fallback) { 350 bool is_build_good = false; 351 CeedInt num_active_bases_in, num_active_bases_out; 352 CeedOperatorAssemblyData assembly_data; 353 354 CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data)); 355 CeedCallBackend( 356 CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL)); 357 if (num_active_bases_in == num_active_bases_out) { 358 CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good)); 359 if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(op, &is_build_good)); 360 } 361 if (!is_build_good) data->use_assembly_fallback = true; 362 } 363 364 // Try assembly 365 if (!data->use_assembly_fallback) { 366 bool is_run_good = true; 367 Ceed_Cuda *cuda_data; 368 CeedInt num_elem, num_input_fields, num_output_fields; 369 CeedEvalMode eval_mode; 370 CeedScalar *assembled_array; 371 CeedQFunctionField *qf_input_fields, *qf_output_fields; 372 CeedQFunction_Cuda_gen *qf_data; 373 CeedQFunction qf; 374 CeedOperatorField *op_input_fields, *op_output_fields; 375 376 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 377 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 378 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 379 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 380 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 381 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 382 383 // Input vectors 384 for (CeedInt i = 0; i < num_input_fields; i++) { 385 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 386 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 387 data->fields.inputs[i] = NULL; 388 } else { 389 bool is_active; 390 CeedVector vec; 391 392 // Get input vector 393 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 394 is_active = vec == CEED_VECTOR_ACTIVE; 395 if (is_active) data->fields.inputs[i] = NULL; 396 else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); 397 CeedCallBackend(CeedVectorDestroy(&vec)); 398 } 399 } 400 401 // Point coordinates 402 { 403 CeedVector vec; 404 405 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 406 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); 407 CeedCallBackend(CeedVectorDestroy(&vec)); 408 409 // Points per elem 410 if (num_elem != data->points.num_elem) { 411 CeedInt *points_per_elem; 412 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 413 CeedElemRestriction rstr_points = NULL; 414 415 data->points.num_elem = num_elem; 416 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 417 CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); 418 for (CeedInt e = 0; e < num_elem; e++) { 419 CeedInt num_points_elem; 420 421 CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); 422 points_per_elem[e] = num_points_elem; 423 } 424 if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); 425 CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); 426 CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); 427 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 428 CeedCallBackend(CeedFree(&points_per_elem)); 429 } 430 } 431 432 // Get context data 433 CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); 434 435 // Assembly array 436 CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array)); 437 438 // Assemble diagonal 439 void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array}; 440 int max_threads_per_block, min_grid_size, grid; 441 442 CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 443 int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; 444 445 CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1, 446 cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); 447 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 448 449 CeedCallBackend( 450 CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_diagonal, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs)); 451 CeedCallCuda(ceed, cudaDeviceSynchronize()); 452 453 // Restore input arrays 454 for (CeedInt i = 0; i < num_input_fields; i++) { 455 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 456 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 457 } else { 458 bool is_active; 459 CeedVector vec; 460 461 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 462 is_active = vec == CEED_VECTOR_ACTIVE; 463 if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); 464 CeedCallBackend(CeedVectorDestroy(&vec)); 465 } 466 } 467 468 // Restore point coordinates 469 { 470 CeedVector vec; 471 472 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 473 CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); 474 CeedCallBackend(CeedVectorDestroy(&vec)); 475 } 476 477 // Restore context data 478 CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); 479 480 // Restore assembly array 481 CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array)); 482 483 // Cleanup 484 CeedCallBackend(CeedQFunctionDestroy(&qf)); 485 if (!is_run_good) data->use_assembly_fallback = true; 486 } 487 CeedCallBackend(CeedDestroy(&ceed)); 488 489 // Fallback, if needed 490 if (data->use_assembly_fallback) { 491 CeedOperator op_fallback; 492 493 CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator"); 494 CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); 495 CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request)); 496 return CEED_ERROR_SUCCESS; 497 } 498 return CEED_ERROR_SUCCESS; 499 } 500 501 //------------------------------------------------------------------------------ 502 // AtPoints full assembly 503 //------------------------------------------------------------------------------ 504 static int CeedSingleOperatorAssembleAtPoints_Cuda_gen(CeedOperator op, CeedInt offset, CeedVector assembled) { 505 Ceed ceed; 506 CeedOperator_Cuda_gen *data; 507 508 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 509 CeedCallBackend(CeedOperatorGetData(op, &data)); 510 511 // Build the assembly kernel 512 if (!data->assemble_full && !data->use_assembly_fallback) { 513 bool is_build_good = false; 514 CeedInt num_active_bases_in, num_active_bases_out; 515 CeedOperatorAssemblyData assembly_data; 516 517 CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data)); 518 CeedCallBackend( 519 CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL)); 520 if (num_active_bases_in == num_active_bases_out) { 521 CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good)); 522 if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(op, &is_build_good)); 523 } 524 if (!is_build_good) data->use_assembly_fallback = true; 525 } 526 527 // Try assembly 528 if (!data->use_assembly_fallback) { 529 bool is_run_good = true; 530 Ceed_Cuda *cuda_data; 531 CeedInt num_elem, num_input_fields, num_output_fields; 532 CeedEvalMode eval_mode; 533 CeedScalar *assembled_array; 534 CeedQFunctionField *qf_input_fields, *qf_output_fields; 535 CeedQFunction_Cuda_gen *qf_data; 536 CeedQFunction qf; 537 CeedOperatorField *op_input_fields, *op_output_fields; 538 539 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 540 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 541 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 542 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 543 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 544 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 545 546 // Input vectors 547 for (CeedInt i = 0; i < num_input_fields; i++) { 548 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 549 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 550 data->fields.inputs[i] = NULL; 551 } else { 552 bool is_active; 553 CeedVector vec; 554 555 // Get input vector 556 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 557 is_active = vec == CEED_VECTOR_ACTIVE; 558 if (is_active) data->fields.inputs[i] = NULL; 559 else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); 560 CeedCallBackend(CeedVectorDestroy(&vec)); 561 } 562 } 563 564 // Point coordinates 565 { 566 CeedVector vec; 567 568 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 569 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords)); 570 CeedCallBackend(CeedVectorDestroy(&vec)); 571 572 // Points per elem 573 if (num_elem != data->points.num_elem) { 574 CeedInt *points_per_elem; 575 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 576 CeedElemRestriction rstr_points = NULL; 577 578 data->points.num_elem = num_elem; 579 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 580 CeedCallBackend(CeedCalloc(num_elem, &points_per_elem)); 581 for (CeedInt e = 0; e < num_elem; e++) { 582 CeedInt num_points_elem; 583 584 CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); 585 points_per_elem[e] = num_points_elem; 586 } 587 if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem)); 588 CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes)); 589 CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice)); 590 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 591 CeedCallBackend(CeedFree(&points_per_elem)); 592 } 593 } 594 595 // Get context data 596 CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); 597 598 // Assembly array 599 CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array)); 600 CeedScalar *assembled_offset_array = &assembled_array[offset]; 601 602 // Assemble diagonal 603 void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, 604 &data->G, &data->W, &data->points, &assembled_offset_array}; 605 int max_threads_per_block, min_grid_size, grid; 606 607 CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); 608 int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; 609 610 CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1, 611 cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); 612 CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); 613 614 CeedCallBackend( 615 CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_full, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs)); 616 CeedCallCuda(ceed, cudaDeviceSynchronize()); 617 618 // Restore input arrays 619 for (CeedInt i = 0; i < num_input_fields; i++) { 620 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 621 if (eval_mode == CEED_EVAL_WEIGHT) { // Skip 622 } else { 623 bool is_active; 624 CeedVector vec; 625 626 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); 627 is_active = vec == CEED_VECTOR_ACTIVE; 628 if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); 629 CeedCallBackend(CeedVectorDestroy(&vec)); 630 } 631 } 632 633 // Restore point coordinates 634 { 635 CeedVector vec; 636 637 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec)); 638 CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords)); 639 CeedCallBackend(CeedVectorDestroy(&vec)); 640 } 641 642 // Restore context data 643 CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); 644 645 // Restore assembly array 646 CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array)); 647 648 // Cleanup 649 CeedCallBackend(CeedQFunctionDestroy(&qf)); 650 if (!is_run_good) data->use_assembly_fallback = true; 651 } 652 CeedCallBackend(CeedDestroy(&ceed)); 653 654 // Fallback, if needed 655 if (data->use_assembly_fallback) { 656 CeedOperator op_fallback; 657 658 CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator"); 659 CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); 660 CeedCallBackend(CeedSingleOperatorAssemble(op_fallback, offset, assembled)); 661 return CEED_ERROR_SUCCESS; 662 } 663 return CEED_ERROR_SUCCESS; 664 } 665 666 //------------------------------------------------------------------------------ 667 // Create operator 668 //------------------------------------------------------------------------------ 669 int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 670 bool is_composite, is_at_points; 671 Ceed ceed; 672 CeedOperator_Cuda_gen *impl; 673 674 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 675 CeedCallBackend(CeedCalloc(1, &impl)); 676 CeedCallBackend(CeedOperatorSetData(op, impl)); 677 CeedCall(CeedOperatorIsComposite(op, &is_composite)); 678 if (is_composite) { 679 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Cuda_gen)); 680 } else { 681 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen)); 682 } 683 CeedCall(CeedOperatorIsAtPoints(op, &is_at_points)); 684 if (is_at_points) { 685 CeedCallBackend( 686 CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen)); 687 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssembleAtPoints_Cuda_gen)); 688 } 689 CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen)); 690 CeedCallBackend(CeedDestroy(&ceed)); 691 return CEED_ERROR_SUCCESS; 692 } 693 694 //------------------------------------------------------------------------------ 695