1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #define CEED_DEBUG_COLOR 12 9 10 #include <ceed.h> 11 #include <ceed/backend.h> 12 #include <ceed/jit-tools.h> 13 #include <cuda_runtime.h> 14 15 #include <iostream> 16 #include <sstream> 17 #include <string> 18 19 #include "../cuda-ref/ceed-cuda-ref.h" 20 #include "../cuda-shared/ceed-cuda-shared.h" 21 #include "../cuda/ceed-cuda-common.h" 22 #include "../cuda/ceed-cuda-compile.h" 23 #include "ceed-cuda-gen.h" 24 25 //------------------------------------------------------------------------------ 26 // Determine type of operator 27 //------------------------------------------------------------------------------ 28 static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields, 29 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields, 30 CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor, 31 bool *use_3d_slices) { 32 // Find dim, P_1d, Q_1d 33 *max_P_1d = 0; 34 *Q_1d = 0; 35 *dim = 0; 36 *is_tensor = true; 37 for (CeedInt i = 0; i < num_input_fields; i++) { 38 CeedBasis basis; 39 40 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 41 if (basis != CEED_BASIS_NONE) { 42 bool is_field_tensor; 43 CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0; 44 45 // Collect dim, P_1d, and Q_1d 46 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 47 CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 48 *is_tensor = *is_tensor && is_field_tensor; 49 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 50 *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 51 CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 52 CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 53 *dim = field_dim; 54 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 55 CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 56 *Q_1d = field_Q_1d; 57 } 58 CeedCallBackend(CeedBasisDestroy(&basis)); 59 } 60 for (CeedInt i = 0; i < num_output_fields; i++) { 61 CeedBasis basis; 62 63 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 64 if (basis != CEED_BASIS_NONE) { 65 bool is_field_tensor; 66 CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0; 67 68 // Collect dim, P_1d, and Q_1d 69 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 70 CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 71 *is_tensor = *is_tensor && is_field_tensor; 72 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 73 *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 74 CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 75 CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 76 *dim = field_dim; 77 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 78 CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 79 *Q_1d = field_Q_1d; 80 } 81 CeedCallBackend(CeedBasisDestroy(&basis)); 82 } 83 84 // Only use 3D collocated gradient parallelization strategy when gradient is computed 85 *use_3d_slices = false; 86 if (*dim == 3) { 87 bool was_grad_found = false; 88 89 for (CeedInt i = 0; i < num_input_fields; i++) { 90 CeedEvalMode eval_mode; 91 92 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 93 if (eval_mode == CEED_EVAL_GRAD) { 94 CeedBasis_Cuda_shared *basis_data; 95 CeedBasis basis; 96 97 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 98 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 99 *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 100 was_grad_found = true; 101 CeedCallBackend(CeedBasisDestroy(&basis)); 102 } 103 } 104 for (CeedInt i = 0; i < num_output_fields; i++) { 105 CeedEvalMode eval_mode; 106 107 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 108 if (eval_mode == CEED_EVAL_GRAD) { 109 CeedBasis_Cuda_shared *basis_data; 110 CeedBasis basis; 111 112 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 113 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 114 *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 115 was_grad_found = true; 116 CeedCallBackend(CeedBasisDestroy(&basis)); 117 } 118 } 119 } 120 return CEED_ERROR_SUCCESS; 121 } 122 123 //------------------------------------------------------------------------------ 124 // Setup fields 125 //------------------------------------------------------------------------------ 126 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field, 127 CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points, 128 bool use_3d_slices) { 129 std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 130 std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d"; 131 std::string option_name = (is_input ? "inputs" : "outputs"); 132 CeedEvalMode eval_mode = CEED_EVAL_NONE; 133 CeedInt elem_size = 0, num_comp = 0, P_1d = 0; 134 CeedElemRestriction elem_rstr; 135 CeedBasis_Cuda_shared *basis_data; 136 CeedBasis basis; 137 138 code << " // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n"; 139 140 // Get field data 141 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 142 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 143 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 144 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 145 } 146 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 147 CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 148 if (basis != CEED_BASIS_NONE) { 149 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 150 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 151 } 152 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 153 154 // Set field constants 155 if (eval_mode != CEED_EVAL_WEIGHT) { 156 code << " const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n"; 157 code << " const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; 158 } 159 160 // Load basis data 161 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 162 switch (eval_mode) { 163 case CEED_EVAL_NONE: 164 break; 165 case CEED_EVAL_INTERP: 166 if (is_at_points) { 167 // AtPoints 168 if (!basis_data->d_chebyshev_interp_1d) { 169 CeedSize interp_bytes; 170 CeedScalar *chebyshev_interp_1d; 171 172 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 173 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 174 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 175 CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 176 CeedCallCuda(CeedBasisReturnCeed(basis), 177 cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 178 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 179 } 180 if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 181 else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 182 } else { 183 // Standard quadrature 184 if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 185 else data->B.outputs[i] = basis_data->d_interp_1d; 186 } 187 code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n"; 188 code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 189 break; 190 case CEED_EVAL_GRAD: 191 if (is_at_points) { 192 // AtPoints 193 if (!basis_data->d_chebyshev_interp_1d) { 194 CeedSize interp_bytes; 195 CeedScalar *chebyshev_interp_1d; 196 197 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 198 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 199 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 200 CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 201 CeedCallCuda(CeedBasisReturnCeed(basis), 202 cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 203 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 204 } 205 if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 206 else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 207 } else { 208 // Standard quadrature 209 if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 210 else data->B.outputs[i] = basis_data->d_interp_1d; 211 } 212 code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n"; 213 code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 214 if (is_at_points) break; // No G mat for AtPoints 215 if (use_3d_slices) { 216 if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d; 217 else data->G.outputs[i] = basis_data->d_collo_grad_1d; 218 code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n"; 219 code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 220 } else { 221 bool has_collo_grad = basis_data->d_collo_grad_1d; 222 223 if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 224 else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 225 if (has_collo_grad) { 226 code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n"; 227 code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 228 } else { 229 code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * P_1d << "];\n"; 230 code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 231 } 232 } 233 break; 234 case CEED_EVAL_WEIGHT: 235 break; // No action 236 // LCOV_EXCL_START 237 case CEED_EVAL_DIV: 238 case CEED_EVAL_CURL: 239 break; // TODO: Not implemented 240 // LCOV_EXCL_STOP 241 } 242 CeedCallBackend(CeedBasisDestroy(&basis)); 243 return CEED_ERROR_SUCCESS; 244 } 245 246 //------------------------------------------------------------------------------ 247 // Restriction 248 //------------------------------------------------------------------------------ 249 static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim, 250 CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, 251 CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) { 252 std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 253 std::string P_name = "P_1d" + var_suffix; 254 CeedEvalMode eval_mode = CEED_EVAL_NONE; 255 CeedInt elem_size = 0, num_comp = 0, P_1d = 0; 256 CeedSize l_size; 257 CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD; 258 CeedElemRestriction_Cuda *rstr_data; 259 CeedElemRestriction elem_rstr; 260 CeedBasis basis; 261 262 // Get field data 263 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 264 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 265 CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); 266 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 267 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 268 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 269 } 270 CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 271 if (basis != CEED_BASIS_NONE) { 272 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 273 } 274 CeedCallBackend(CeedBasisDestroy(&basis)); 275 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 276 277 // Restriction 278 if (is_input) { 279 // Input 280 if (field_input_buffer[i] != i) { 281 std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]); 282 283 // Restriction was already done for previous input 284 code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n"; 285 } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) { 286 if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) { 287 // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated 288 code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n"; 289 } else if (rstr_type != CEED_RESTRICTION_POINTS) { 290 // Otherwise we're using the scratch space 291 code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 292 } 293 switch (rstr_type) { 294 case CEED_RESTRICTION_STANDARD: { 295 CeedInt comp_stride; 296 297 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 298 code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 299 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 300 code << " // CompStride: " << comp_stride << "\n"; 301 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 302 code << " ReadLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" 303 << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n"; 304 break; 305 } 306 case CEED_RESTRICTION_STRIDED: { 307 bool has_backend_strides; 308 CeedInt num_elem; 309 310 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 311 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 312 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 313 314 if (!has_backend_strides) { 315 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 316 } 317 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 318 code << " ReadLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << "," 319 << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n"; 320 break; 321 } 322 case CEED_RESTRICTION_POINTS: { 323 CeedInt comp_stride; 324 325 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 326 code << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 327 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 328 break; 329 } 330 // LCOV_EXCL_START 331 case CEED_RESTRICTION_ORIENTED: 332 case CEED_RESTRICTION_CURL_ORIENTED: 333 break; // TODO: Not implemented 334 // LCOV_EXCL_STOP 335 } 336 } 337 } else { 338 // Output 339 switch (rstr_type) { 340 case CEED_RESTRICTION_STANDARD: { 341 CeedInt comp_stride; 342 343 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 344 code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 345 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 346 code << " // CompStride: " << comp_stride << "\n"; 347 data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 348 code << " WriteLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" 349 << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n"; 350 break; 351 } 352 case CEED_RESTRICTION_STRIDED: { 353 bool has_backend_strides; 354 CeedInt num_elem; 355 356 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 357 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 358 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 359 360 if (!has_backend_strides) { 361 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 362 } 363 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 364 code << " WriteLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << "," 365 << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n"; 366 break; 367 } 368 case CEED_RESTRICTION_POINTS: 369 data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 370 break; 371 // LCOV_EXCL_START 372 case CEED_RESTRICTION_ORIENTED: 373 case CEED_RESTRICTION_CURL_ORIENTED: 374 break; // TODO: Not implemented 375 // LCOV_EXCL_STOP 376 } 377 } 378 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 379 return CEED_ERROR_SUCCESS; 380 } 381 382 //------------------------------------------------------------------------------ 383 // Basis 384 //------------------------------------------------------------------------------ 385 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim, 386 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, 387 bool is_at_points, bool use_3d_slices) { 388 std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 389 std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d"; 390 CeedEvalMode eval_mode = CEED_EVAL_NONE; 391 CeedInt elem_size = 0, num_comp = 0, P_1d = 0; 392 CeedElemRestriction elem_rstr; 393 CeedBasis basis; 394 395 // Get field data 396 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 397 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 398 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 399 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 400 } 401 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 402 CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 403 if (basis != CEED_BASIS_NONE) { 404 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 405 } 406 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 407 408 // Basis 409 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 410 if (is_input) { 411 switch (eval_mode) { 412 case CEED_EVAL_NONE: 413 if (!use_3d_slices && !is_at_points) { 414 code << " CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n"; 415 } 416 break; 417 case CEED_EVAL_INTERP: 418 if (is_at_points) { 419 code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n"; 420 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name 421 << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 422 } else { 423 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 424 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name 425 << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 426 } 427 break; 428 case CEED_EVAL_GRAD: 429 if (is_at_points) { 430 code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n"; 431 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name 432 << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 433 } else if (use_3d_slices) { 434 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 435 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name 436 << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 437 } else { 438 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n"; 439 code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" << var_suffix 440 << ", P_1d" << var_suffix << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" 441 << var_suffix << ");\n"; 442 } 443 break; 444 case CEED_EVAL_WEIGHT: { 445 if (is_at_points) { 446 code << " // Nothing to do AtPoints\n"; 447 } else { 448 CeedBasis_Cuda_shared *basis_data; 449 450 code << " CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n"; 451 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 452 data->W = basis_data->d_q_weight_1d; 453 code << " Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n"; 454 } 455 break; 456 } 457 // LCOV_EXCL_START 458 case CEED_EVAL_DIV: 459 case CEED_EVAL_CURL: 460 break; // TODO: Not implemented 461 // LCOV_EXCL_STOP 462 } 463 } else { 464 switch (eval_mode) { 465 case CEED_EVAL_NONE: 466 code << " CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n"; 467 break; // No action 468 case CEED_EVAL_INTERP: 469 code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 470 if (is_at_points) { 471 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name 472 << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 473 } else { 474 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name 475 << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 476 } 477 break; 478 case CEED_EVAL_GRAD: 479 code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 480 if (is_at_points) { 481 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name 482 << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 483 } else if (use_3d_slices) { 484 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name 485 << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 486 } else { 487 code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" 488 << var_suffix << ", " << P_name << "," << Q_name << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix 489 << ", r_e" << var_suffix << ");\n"; 490 } 491 break; 492 // LCOV_EXCL_START 493 case CEED_EVAL_WEIGHT: 494 break; // Should not occur 495 case CEED_EVAL_DIV: 496 case CEED_EVAL_CURL: 497 break; // TODO: Not implemented 498 // LCOV_EXCL_STOP 499 } 500 } 501 CeedCallBackend(CeedBasisDestroy(&basis)); 502 return CEED_ERROR_SUCCESS; 503 } 504 505 //------------------------------------------------------------------------------ 506 // QFunction 507 //------------------------------------------------------------------------------ 508 static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt dim, CeedInt max_num_points, 509 CeedInt num_input_fields, CeedOperatorField *op_input_fields, 510 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, 511 CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields, 512 std::string qfunction_name, CeedInt Q_1d, bool is_at_points, bool use_3d_slices) { 513 std::string Q_name = "Q_1d"; 514 CeedEvalMode eval_mode = CEED_EVAL_NONE; 515 CeedElemRestriction elem_rstr; 516 517 // Setup output arrays 518 code << "\n // -- Output field setup\n"; 519 for (CeedInt i = 0; i < num_output_fields; i++) { 520 std::string var_suffix = "_out_" + std::to_string(i); 521 522 code << " // ---- Output field " << i << "\n"; 523 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 524 switch (eval_mode) { 525 case CEED_EVAL_NONE: 526 if (is_at_points) { 527 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n"; 528 } else { 529 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 530 } 531 break; 532 case CEED_EVAL_INTERP: 533 if (is_at_points) { 534 // Accumulator for point data 535 code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n"; 536 code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n"; 537 code << " r_c" << var_suffix << "[i] = 0.0;\n"; 538 code << " }\n"; 539 } else { 540 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 541 } 542 break; 543 case CEED_EVAL_GRAD: 544 if (is_at_points) { 545 // Accumulator for point data 546 code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "*dim];\n"; 547 code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n"; 548 code << " r_c" << var_suffix << "[i] = 0.0;\n"; 549 code << " }\n"; 550 } else if (use_3d_slices) { 551 // Accumulator for gradient slices 552 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 553 code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n"; 554 code << " r_q" << var_suffix << "[i] = 0.0;\n"; 555 code << " }\n"; 556 } else { 557 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n"; 558 } 559 break; 560 case CEED_EVAL_WEIGHT: 561 break; 562 // LCOV_EXCL_START 563 case CEED_EVAL_DIV: 564 case CEED_EVAL_CURL: 565 break; // TODO: Not implemented 566 // LCOV_EXCL_STOP 567 } 568 } 569 570 if (is_at_points) { 571 // We need to handle batches of points 572 code << "\n // Note: Using batches of points\n"; 573 code << " const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n"; 574 code << " #pragma unroll\n"; 575 code << " for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n"; 576 code << " const CeedInt p = i % max_num_points;\n\n"; 577 578 code << " // -- Coordinates\n"; 579 code << " CeedScalar r_x[dim];\n"; 580 code << " ReadPoint<dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n"; 581 582 code << " // -- Input fields\n"; 583 for (CeedInt i = 0; i < num_input_fields; i++) { 584 std::string var_suffix = "_in_" + std::to_string(i); 585 586 code << " // ---- Input field " << i << "\n"; 587 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 588 // Basis action 589 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 590 switch (eval_mode) { 591 case CEED_EVAL_NONE: 592 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 593 code << " ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 594 << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 595 break; 596 case CEED_EVAL_INTERP: 597 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 598 code << " InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix 599 << ", r_x, r_s" << var_suffix << ");\n"; 600 break; 601 case CEED_EVAL_GRAD: 602 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 603 code << " GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix 604 << ", r_x, r_s" << var_suffix << ");\n"; 605 break; 606 case CEED_EVAL_WEIGHT: 607 code << " CeedScalar r_s" << var_suffix << "[1];\n"; 608 code << " r_s" << var_suffix << "[0] = 1.0;\n"; 609 break; 610 // LCOV_EXCL_START 611 case CEED_EVAL_DIV: 612 case CEED_EVAL_CURL: 613 break; // TODO: Not implemented 614 // LCOV_EXCL_STOP 615 } 616 } 617 code << "\n // -- Output fields\n"; 618 for (CeedInt i = 0; i < num_output_fields; i++) { 619 std::string var_suffix = "_out_" + std::to_string(i); 620 621 code << " // ---- Output field " << i << "\n"; 622 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 623 // Basis action 624 switch (eval_mode) { 625 case CEED_EVAL_NONE: 626 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 627 break; 628 case CEED_EVAL_INTERP: 629 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 630 break; 631 case CEED_EVAL_GRAD: 632 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 633 break; 634 // LCOV_EXCL_START 635 case CEED_EVAL_WEIGHT: 636 break; // Should not occur 637 case CEED_EVAL_DIV: 638 case CEED_EVAL_CURL: 639 break; // TODO: Not implemented 640 // LCOV_EXCL_STOP 641 } 642 } 643 644 } else if (use_3d_slices) { 645 // We treat quadrature points per slice in 3d to save registers 646 code << "\n // Note: Using planes of 3D elements\n"; 647 code << " #pragma unroll\n"; 648 code << " for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; 649 code << " // -- Input fields\n"; 650 for (CeedInt i = 0; i < num_input_fields; i++) { 651 std::string var_suffix = "_in_" + std::to_string(i); 652 653 code << " // ---- Input field " << i << "\n"; 654 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 655 // Basis action 656 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 657 switch (eval_mode) { 658 case CEED_EVAL_NONE: 659 bool is_strided; 660 661 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 662 663 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 664 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 665 if (is_strided) { 666 bool has_backend_strides; 667 CeedInt num_elem, elem_size; 668 669 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 670 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 671 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 672 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 673 674 if (!has_backend_strides) { 675 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 676 } 677 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 678 code << " ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << "," 679 << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n"; 680 } else { 681 CeedSize l_size = 0; 682 CeedInt comp_stride; 683 CeedElemRestriction_Cuda *rstr_data; 684 685 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 686 code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 687 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 688 code << " // CompStride: " << comp_stride << "\n"; 689 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 690 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 691 code << " ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix 692 << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 693 } 694 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 695 break; 696 case CEED_EVAL_INTERP: 697 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 698 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; 699 code << " r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n"; 700 code << " }\n"; 701 break; 702 case CEED_EVAL_GRAD: 703 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 704 code << " GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix 705 << ", r_s" << var_suffix << ");\n"; 706 break; 707 case CEED_EVAL_WEIGHT: 708 code << " CeedScalar r_s" << var_suffix << "[1];\n"; 709 code << " r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; 710 break; 711 // LCOV_EXCL_START 712 case CEED_EVAL_DIV: 713 case CEED_EVAL_CURL: 714 break; // TODO: Not implemented 715 // LCOV_EXCL_STOP 716 } 717 } 718 code << "\n // -- Output fields\n"; 719 for (CeedInt i = 0; i < num_output_fields; i++) { 720 std::string var_suffix = "_out_" + std::to_string(i); 721 722 code << " // ---- Output field " << i << "\n"; 723 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 724 // Basis action 725 switch (eval_mode) { 726 case CEED_EVAL_NONE: 727 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 728 break; 729 case CEED_EVAL_INTERP: 730 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 731 break; 732 case CEED_EVAL_GRAD: 733 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 734 break; 735 // LCOV_EXCL_START 736 case CEED_EVAL_WEIGHT: 737 break; // Should not occur 738 case CEED_EVAL_DIV: 739 case CEED_EVAL_CURL: 740 break; // TODO: Not implemented 741 // LCOV_EXCL_STOP 742 } 743 } 744 } else { 745 code << "\n // Note: Using full elements\n"; 746 code << " {\n"; 747 code << " // -- Input fields\n"; 748 for (CeedInt i = 0; i < num_input_fields; i++) { 749 code << " // ---- Input field " << i << "\n"; 750 code << " CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; 751 } 752 code << " // -- Output fields\n"; 753 for (CeedInt i = 0; i < num_output_fields; i++) { 754 code << " // ---- Output field " << i << "\n"; 755 code << " CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; 756 } 757 } 758 759 // Input and output buffers 760 code << "\n // -- QFunction inputs and outputs\n"; 761 code << " // ---- Inputs\n"; 762 code << " CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; 763 for (CeedInt i = 0; i < num_input_fields; i++) { 764 code << " // ------ Input field " << i << "\n"; 765 code << " inputs[" << i << "] = r_s_in_" << i << ";\n"; 766 } 767 code << " // ---- Outputs\n"; 768 code << " CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; 769 for (CeedInt i = 0; i < num_output_fields; i++) { 770 code << " // ------ Output field " << i << "\n"; 771 code << " outputs[" << i << "] = r_s_out_" << i << ";\n"; 772 } 773 774 // Apply QFunction 775 code << "\n // -- Apply QFunction\n"; 776 code << " " << qfunction_name << "(ctx, "; 777 if (dim != 3 || is_at_points || use_3d_slices) { 778 code << "1"; 779 } else { 780 code << "Q_1d"; 781 } 782 code << ", inputs, outputs);\n"; 783 784 if (is_at_points) { 785 // Map back to coefficients 786 code << "\n // -- Output fields\n"; 787 for (CeedInt i = 0; i < num_output_fields; i++) { 788 std::string var_suffix = "_out_" + std::to_string(i); 789 std::string P_name = "P_1d" + var_suffix; 790 791 code << " // ---- Output field " << i << "\n"; 792 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 793 // Basis action 794 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 795 switch (eval_mode) { 796 case CEED_EVAL_NONE: { 797 CeedInt comp_stride; 798 CeedElemRestriction elem_rstr; 799 800 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 801 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 802 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 803 code << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 804 code << " WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 805 << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]" 806 << ", r_s" << var_suffix << ", d" << var_suffix << ");\n"; 807 break; 808 } 809 case CEED_EVAL_INTERP: 810 code << " if (i >= points.num_per_elem[elem]) {\n"; 811 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 812 code << " }\n"; 813 code << " InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s" 814 << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 815 break; 816 case CEED_EVAL_GRAD: 817 code << " if (i >= points.num_per_elem[elem]) {\n"; 818 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 819 code << " }\n"; 820 code << " GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s" 821 << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 822 break; 823 // LCOV_EXCL_START 824 case CEED_EVAL_WEIGHT: 825 break; // Should not occur 826 case CEED_EVAL_DIV: 827 case CEED_EVAL_CURL: 828 break; // TODO: Not implemented 829 // LCOV_EXCL_STOP 830 } 831 } 832 } else if (use_3d_slices) { 833 // Copy or apply transpose grad, if needed 834 code << "\n // -- Output fields\n"; 835 for (CeedInt i = 0; i < num_output_fields; i++) { 836 std::string var_suffix = "_out_" + std::to_string(i); 837 std::string P_name = "P_1d" + var_suffix; 838 839 code << " // ---- Output field " << i << "\n"; 840 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 841 // Basis action 842 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 843 switch (eval_mode) { 844 case CEED_EVAL_NONE: 845 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 846 code << " r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 847 code << " }\n"; 848 break; 849 case CEED_EVAL_INTERP: 850 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 851 code << " r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 852 code << " }\n"; 853 break; 854 case CEED_EVAL_GRAD: 855 code << " GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G" 856 << var_suffix << ", r_q" << var_suffix << ");\n"; 857 break; 858 // LCOV_EXCL_START 859 case CEED_EVAL_WEIGHT: 860 break; // Should not occur 861 case CEED_EVAL_DIV: 862 case CEED_EVAL_CURL: 863 break; // TODO: Not implemented 864 // LCOV_EXCL_STOP 865 } 866 } 867 } 868 code << " }\n"; 869 return CEED_ERROR_SUCCESS; 870 } 871 872 //------------------------------------------------------------------------------ 873 // Build single operator kernel 874 //------------------------------------------------------------------------------ 875 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { 876 bool is_tensor = true, is_at_points = false, use_3d_slices = false; 877 Ceed ceed; 878 CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0; 879 CeedQFunctionField *qf_input_fields, *qf_output_fields; 880 CeedQFunction_Cuda_gen *qf_data; 881 CeedQFunction qf; 882 CeedOperatorField *op_input_fields, *op_output_fields; 883 CeedOperator_Cuda_gen *data; 884 std::ostringstream code; 885 886 { 887 bool is_setup_done; 888 889 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 890 if (is_setup_done) return CEED_ERROR_SUCCESS; 891 } 892 893 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 894 CeedCallBackend(CeedOperatorGetData(op, &data)); 895 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 896 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 897 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 898 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 899 900 // Get operator data 901 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 902 CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields, 903 qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices)); 904 if (dim == 0) dim = 1; 905 data->dim = dim; 906 if (is_at_points) { 907 CeedElemRestriction_Cuda *rstr_data; 908 CeedElemRestriction rstr_points = NULL; 909 910 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 911 CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 912 CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 913 CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data)); 914 data->points.indices = (CeedInt *)rstr_data->d_offsets; 915 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 916 } 917 if (is_at_points) use_3d_slices = false; 918 if (Q_1d == 0) { 919 if (is_at_points) Q_1d = max_num_points; 920 else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d)); 921 } 922 data->Q_1d = Q_1d; 923 924 // Check for restriction only identity operator 925 { 926 bool is_identity_qf; 927 928 CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 929 if (is_identity_qf) { 930 CeedEvalMode eval_mode_in, eval_mode_out; 931 932 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 933 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 934 CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, 935 "Backend does not implement restriction only identity operators"); 936 } 937 } 938 939 // Add atomicAdd function for old NVidia architectures 940 { 941 Ceed_Cuda *ceed_data; 942 struct cudaDeviceProp prop; 943 944 CeedCallBackend(CeedGetData(ceed, &ceed_data)); 945 CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); 946 if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { 947 code << "// AtomicAdd fallback source\n"; 948 code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n"; 949 } 950 } 951 952 // Load basis source files 953 // TODO: Add non-tensor, AtPoints 954 code << "// Tensor basis source\n"; 955 code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n"; 956 code << "// AtPoints basis source\n"; 957 code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n"; 958 code << "// CodeGen operator source\n"; 959 code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n"; 960 961 // Get QFunction name 962 std::string qfunction_name(qf_data->qfunction_name); 963 std::string operator_name; 964 965 operator_name = "CeedKernelCudaGenOperator_" + qfunction_name; 966 967 // Define CEED_Q_VLA 968 code << "\n#undef CEED_Q_VLA\n"; 969 if (dim != 3 || is_at_points || use_3d_slices) { 970 code << "#define CEED_Q_VLA 1\n\n"; 971 } else { 972 code << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 973 } 974 975 // Add user QFunction source 976 { 977 const char *source_path; 978 979 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 980 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file"); 981 982 code << "// User QFunction source\n"; 983 code << "#include \"" << source_path << "\"\n\n"; 984 } 985 986 // Setup 987 code << "\n// -----------------------------------------------------------------------------\n"; 988 code << "// Operator Kernel\n"; 989 code << "// \n"; 990 code << "// d_[in,out]_i: CeedVector device array\n"; 991 code << "// r_[in,out]_e_i: Element vector register\n"; 992 code << "// r_[in,out]_q_i: Quadrature space vector register\n"; 993 code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficents register\n"; 994 code << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 995 code << "// \n"; 996 code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 997 code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 998 code << "// -----------------------------------------------------------------------------\n"; 999 code << "extern \"C\" __global__ void " << operator_name 1000 << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " 1001 "points) {\n"; 1002 1003 // Scratch buffers 1004 for (CeedInt i = 0; i < num_input_fields; i++) { 1005 CeedEvalMode eval_mode; 1006 1007 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1008 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 1009 code << " const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n"; 1010 } 1011 } 1012 for (CeedInt i = 0; i < num_output_fields; i++) { 1013 code << " CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n"; 1014 } 1015 1016 code << " const CeedInt dim = " << dim << ";\n"; 1017 code << " const CeedInt Q_1d = " << Q_1d << ";\n"; 1018 if (is_at_points) { 1019 code << " const CeedInt max_num_points = " << max_num_points << ";\n"; 1020 code << " const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 1021 } 1022 1023 // Shared data 1024 code << " extern __shared__ CeedScalar slice[];\n"; 1025 code << " SharedData_Cuda data;\n"; 1026 code << " data.t_id_x = threadIdx.x;\n"; 1027 code << " data.t_id_y = threadIdx.y;\n"; 1028 code << " data.t_id_z = threadIdx.z;\n"; 1029 code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 1030 code << " data.slice = slice + data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; 1031 1032 // Initialize constants, and matrices B and G 1033 code << "\n // Input field constants and basis data\n"; 1034 for (CeedInt i = 0; i < num_input_fields; i++) { 1035 CeedCallBackend( 1036 CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_at_points, use_3d_slices)); 1037 } 1038 code << "\n // Output field constants and basis data\n"; 1039 for (CeedInt i = 0; i < num_output_fields; i++) { 1040 CeedCallBackend( 1041 CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices)); 1042 } 1043 1044 // Loop over all elements 1045 code << "\n // Element loop\n"; 1046 code << " __syncthreads();\n"; 1047 code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 1048 1049 // -- Compute minimum buffer space needed 1050 CeedInt max_rstr_buffer_size = 1; 1051 1052 for (CeedInt i = 0; i < num_input_fields; i++) { 1053 CeedInt num_comp, elem_size; 1054 CeedElemRestriction elem_rstr; 1055 1056 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 1057 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1058 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 1059 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size); 1060 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1061 } 1062 for (CeedInt i = 0; i < num_output_fields; i++) { 1063 CeedInt num_comp, elem_size; 1064 CeedElemRestriction elem_rstr; 1065 1066 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1067 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1068 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 1069 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size); 1070 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1071 } 1072 code << " // Scratch restriction buffer space\n"; 1073 code << " CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 1074 1075 // -- Determine best input field processing order 1076 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 1077 1078 for (CeedInt i = 0; i < num_input_fields; i++) { 1079 field_rstr_in_buffer[i] = -1; 1080 input_field_order[i] = -1; 1081 } 1082 { 1083 bool is_ordered[CEED_FIELD_MAX]; 1084 CeedInt curr_index = 0; 1085 1086 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 1087 for (CeedInt i = 0; i < num_input_fields; i++) { 1088 CeedVector vec_i; 1089 CeedElemRestriction rstr_i; 1090 1091 if (is_ordered[i]) continue; 1092 field_rstr_in_buffer[i] = i; 1093 is_ordered[i] = true; 1094 input_field_order[curr_index] = i; 1095 curr_index++; 1096 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 1097 if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 1098 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 1099 for (CeedInt j = i + 1; j < num_input_fields; j++) { 1100 CeedVector vec_j; 1101 CeedElemRestriction rstr_j; 1102 1103 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 1104 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 1105 if (rstr_i == rstr_j && vec_i == vec_j) { 1106 field_rstr_in_buffer[j] = i; 1107 is_ordered[j] = true; 1108 input_field_order[curr_index] = j; 1109 curr_index++; 1110 } 1111 CeedCallBackend(CeedVectorDestroy(&vec_j)); 1112 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 1113 } 1114 CeedCallBackend(CeedVectorDestroy(&vec_i)); 1115 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 1116 } 1117 } 1118 1119 // -- Input restriction and basis 1120 code << "\n // -- Input field restrictions and basis actions\n"; 1121 for (CeedInt i = 0; i < num_input_fields; i++) { 1122 CeedInt f = input_field_order[i]; 1123 1124 code << " // ---- Input field " << f << "\n"; 1125 1126 // ---- Restriction 1127 CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 1128 Q_1d, true, is_at_points, use_3d_slices)); 1129 1130 // ---- Basis action 1131 CeedCallBackend( 1132 CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_at_points, use_3d_slices)); 1133 } 1134 1135 // -- Q function 1136 CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields, 1137 num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points, 1138 use_3d_slices)); 1139 1140 // -- Output basis and restriction 1141 code << "\n // -- Output field basis action and restrictions\n"; 1142 for (CeedInt i = 0; i < num_output_fields; i++) { 1143 code << " // ---- Output field " << i << "\n"; 1144 1145 // ---- Basis action 1146 CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, 1147 use_3d_slices)); 1148 1149 // ---- Restriction 1150 CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, 1151 is_at_points, use_3d_slices)); 1152 } 1153 1154 // Close loop and function 1155 code << " }\n"; 1156 code << "}\n"; 1157 code << "// -----------------------------------------------------------------------------\n\n"; 1158 1159 // View kernel for debugging 1160 CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d))); 1161 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); 1162 CeedCallBackend(CeedOperatorSetSetupDone(op)); 1163 CeedCallBackend(CeedDestroy(&ceed)); 1164 CeedCallBackend(CeedQFunctionDestroy(&qf)); 1165 return CEED_ERROR_SUCCESS; 1166 } 1167 1168 //------------------------------------------------------------------------------ 1169