// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#define CEED_DEBUG_COLOR 12

#include <ceed/ceed.h>
#include <ceed/backend.h>
#include <ceed/jit-tools.h>
#include <cuda_runtime.h>
#include <iostream>
#include <sstream>
#include "ceed-cuda-gen.h"
#include "../cuda/ceed-cuda-compile.h"
#include "../cuda-ref/ceed-cuda-ref.h"
#include "../cuda-shared/ceed-cuda-shared.h"

//------------------------------------------------------------------------------
// Build single operator kernel
//
// Generates the source for one fused CUDA kernel implementing the full action
// of `op` (element restrictions, basis actions, QFunction application, and the
// transpose restrictions/basis actions), JIT-compiles it, and stores the
// compiled module/kernel handle in the operator's backend data.
//
// Idempotent: returns immediately if the operator setup was already completed.
// Returns CEED_ERROR_SUCCESS or a CEED error code.
//------------------------------------------------------------------------------
extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
  using std::ostringstream;
  using std::string;
  int ierr;

  // Nothing to do if the kernel has already been built for this operator
  bool is_setup_done;
  ierr = CeedOperatorIsSetupDone(op, &is_setup_done); CeedChkBackend(ierr);
  if (is_setup_done) return CEED_ERROR_SUCCESS;

  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
  CeedOperator_Cuda_gen *data;
  ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
  CeedQFunction qf;
  CeedQFunction_Cuda_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
  ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
  CeedSize lsize;
  CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields,
          num_output_fields, num_comp, dim = 1;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
  Q_1d = Q; // Fallback; overwritten below from tensor bases when available
  CeedOperatorField *op_input_fields, *op_output_fields;
  ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields);
  CeedChkBackend(ierr);
  CeedQFunctionField *qf_input_fields, *qf_output_fields;
  ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields);
  CeedChkBackend(ierr);
  CeedEvalMode eval_mode;
  CeedBasis basis;
  CeedBasis_Cuda_shared *basis_data;
  CeedElemRestriction Erestrict;
  CeedElemRestriction_Cuda *restr_data;

  // TODO: put in a function?
  // Check for restriction only identity operator
  bool is_identity_qf;
  ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
  if (is_identity_qf) {
    CeedEvalMode eval_mode_in, eval_mode_out;
    ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in); CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out); CeedChkBackend(ierr);
    if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE)
      // LCOV_EXCL_START
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "Backend does not implement restriction only identity operators");
    // LCOV_EXCL_STOP
  }

  // Accumulates the full kernel source
  ostringstream code;

  // TODO: put in a function?
  // Add atomicAdd function for old NVIDIA architectures
  // (native double-precision atomicAdd requires compute capability >= 6.0)
  struct cudaDeviceProp prop;
  Ceed_Cuda *ceed_data;
  ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr);
  // cudaGetDeviceProperties returns a cudaError_t, not a CEED error code,
  // so it must be checked in the CUDA error domain
  cudaError_t cuda_err = cudaGetDeviceProperties(&prop, ceed_data->device_id);
  if (cuda_err != cudaSuccess)
    // LCOV_EXCL_START
    return CeedError(ceed, CEED_ERROR_BACKEND,
                     "cudaGetDeviceProperties failed: %s", cudaGetErrorString(cuda_err));
  // LCOV_EXCL_STOP
  if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
    char *atomic_add_path, *atomic_add_source;
    ierr = CeedGetJitAbsolutePath(ceed,
                                  "ceed/jit-source/cuda/cuda-atomic-add-fallback.h",
                                  &atomic_add_path); CeedChkBackend(ierr);
    CeedDebug256(ceed, 2, "----- Loading Atomic Add Source -----\n");
    ierr = CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source);
    CeedChkBackend(ierr);
    code << atomic_add_source;
    ierr = CeedFree(&atomic_add_path); CeedChkBackend(ierr);
    ierr = CeedFree(&atomic_add_source); CeedChkBackend(ierr);
  }

  // Load basis source files
  // TODO: generalize to accept different device functions?
  {
    char *tensor_basis_kernel_path, *tensor_basis_kernel_source;
    ierr = CeedGetJitAbsolutePath(ceed,
                                  "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h",
                                  &tensor_basis_kernel_path); CeedChkBackend(ierr);
    CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n");
    ierr = CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source);
    CeedChkBackend(ierr);
    code << tensor_basis_kernel_source;
    ierr = CeedFree(&tensor_basis_kernel_path); CeedChkBackend(ierr);
    ierr = CeedFree(&tensor_basis_kernel_source); CeedChkBackend(ierr);
  }
  {
    char *cuda_gen_template_path, *cuda_gen_template_source;
    ierr = CeedGetJitAbsolutePath(ceed,
                                  "ceed/jit-source/cuda/cuda-gen-templates.h",
                                  &cuda_gen_template_path); CeedChkBackend(ierr);
    CeedDebug256(ceed, 2, "----- Loading Cuda-Gen Template Source -----\n");
    ierr = CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source);
    CeedChkBackend(ierr);
    code << cuda_gen_template_source;
    ierr = CeedFree(&cuda_gen_template_path); CeedChkBackend(ierr);
    ierr = CeedFree(&cuda_gen_template_source); CeedChkBackend(ierr);
  }

  // Get QFunction source and name
  string q_function_source(qf_data->q_function_source);
  string q_function_name(qf_data->q_function_name);
  string operator_name;
  operator_name = "CeedKernelCudaGenOperator_" + q_function_name;

  // Find dim, P_1d, Q_1d
  data->max_P_1d = 0;
  for (CeedInt i = 0; i < num_input_fields; i++) {
    ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
      CeedChkBackend(ierr);

      // Collect dim, P_1d, and Q_1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
        ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
        if (P_1d > data->max_P_1d) data->max_P_1d = P_1d;
      } else {
        // LCOV_EXCL_START
        return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
        // LCOV_EXCL_STOP
      }
    }
  }
  // Check output bases for Q_1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
  for (CeedInt i = 0; i < num_output_fields; i++) {
    ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr);

    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
      CeedChkBackend(ierr);

      // Collect Q_1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
      } else {
        // LCOV_EXCL_START
        return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
        // LCOV_EXCL_STOP
      }
    }
  }
  data->dim = dim;
  data->Q_1d = Q_1d;

  // Only use 3D collocated gradient parallelization strategy when gradient is computed
  // TODO: put in a function?
  // The strategy is used only if every gradient field (input and output) has a
  // collocated 1D gradient matrix available
  bool use_collograd_parallelization = false;
  if (dim == 3) {
    bool was_grad_found = false;
    for (CeedInt i = 0; i < num_input_fields; i++) {
      ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
      CeedChkBackend(ierr);
      if (eval_mode == CEED_EVAL_GRAD) {
        ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr);
        ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
        use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
        was_grad_found = true;
      }
    }
    for (CeedInt i = 0; i < num_output_fields; i++) {
      ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
      CeedChkBackend(ierr);
      if (eval_mode == CEED_EVAL_GRAD) {
        ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr);
        ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
        use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
        was_grad_found = true;
      }
    }
  }

  // Define CEED_Q_VLA
  code << "\n#undef CEED_Q_VLA\n";
  if (dim != 3 || use_collograd_parallelization) {
    code << "#define CEED_Q_VLA 1\n\n";
  } else {
    code << "#define CEED_Q_VLA "<<Q_1d<<"\n\n";
  }

  code << q_function_source;

  // Setup
  code << "\n// -----------------------------------------------------------------------------\n";
  code << "\nextern \"C\" __global__ void "<<operator_name<<"(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar* W) {\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
    CeedChkBackend(ierr);
    if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
      code << "  const CeedScalar* d_u_" <<i<<" = fields.inputs["<<i<<"];\n";
    }
  }

  for (CeedInt i = 0; i < num_output_fields; i++) {
    code << "  CeedScalar* d_v_"<<i<<" = fields.outputs["<<i<<"];\n";
  }

  code << "  const CeedInt dim = "<<dim<<";\n";
  code << "  const CeedInt Q_1d = "<<Q_1d<<";\n";

  code << "  extern __shared__ CeedScalar slice[];\n";
  // TODO put in a function? InitSharedData_Cuda?
  code << "  SharedData_Cuda data;\n";
  code << "  data.t_id_x = threadIdx.x;\n";
  code << "  data.t_id_y = threadIdx.y;\n";
  code << "  data.t_id_z = threadIdx.z;\n";
  code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
  code << "  data.slice = slice+data.t_id_z*T_1D"<<(dim>1?"*T_1D":"")<<";\n";

  code << "\n  // -- Input field constants and basis data --\n";
  // TODO: Put in a function?
  //Initialize constants, and matrices B and G
  for (CeedInt i = 0; i < num_input_fields; i++) {
    code << "  // ---- Input field "<<i<<" ----\n";
    // Get elem_size, eval_mode, num_comp
    ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp);
    CeedChkBackend(ierr);

    // Set field constants
    if (eval_mode != CEED_EVAL_WEIGHT) {
      ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr);
      if (basis != CEED_BASIS_COLLOCATED) {
        ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
        code << "  const CeedInt P_in_"<<i<<" = "<<P_1d<<";\n";
      } else {
        code << "  const CeedInt P_in_"<<i<<" = "<<Q_1d<<";\n";
      }
      code << "  const CeedInt num_comp_in_"<<i<<" = "<<num_comp<<";\n";
    }

    // Load basis data
    code << "  // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n";
    switch (eval_mode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.inputs[i] = basis_data->d_interp_1d;
      code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n";
      code << "  loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.inputs[i] = basis_data->d_interp_1d;
      code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n";
      code << "  loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n";
      if (use_collograd_parallelization) {
        data->G.inputs[i] = basis_data->d_collo_grad_1d;
        code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*Q_1d<<"];\n";
        code << "  loadMatrix<Q_1d,Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n";
      } else {
        bool has_collo_grad = !!basis_data->d_collo_grad_1d;
        data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n";
        code << "  loadMatrix<"<<(has_collo_grad?"Q_1d":("P_in_"+std::to_string(i)))<<",Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  code << "\n  // -- Output field constants and basis data --\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    code << "  // ---- Output field "<<i<<" ----\n";
    // Get elem_size, eval_mode, num_comp
    ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp);
    CeedChkBackend(ierr);

    // Set field constants
    ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
      code << "  const CeedInt P_out_"<<i<<" = "<<P_1d<<";\n";
    } else {
      code << "  const CeedInt P_out_"<<i<<" = "<<Q_1d<<";\n";
    }
    code << "  const CeedInt num_comp_out_"<<i<<" = "<<num_comp<<";\n";

    // Load basis data
    code << "  // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n";
    switch (eval_mode) {
    case CEED_EVAL_NONE:
      break; // No action
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.outputs[i] = basis_data->d_interp_1d;
      code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n";
      code << "  loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->B.outputs[i] = basis_data->d_interp_1d;
      code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n";
      code << "  loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n";
      if (use_collograd_parallelization) {
        data->G.outputs[i] = basis_data->d_collo_grad_1d;
        code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*Q_1d<<"];\n";
        code << "  loadMatrix<Q_1d,Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n";
      } else {
        bool has_collo_grad = !!basis_data->d_collo_grad_1d;
        data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n";
        code << "  loadMatrix<"<<(has_collo_grad?"Q_1d":("P_out_"+std::to_string(i)))<<",Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    // LCOV_EXCL_STOP
    }
  }
  code << "\n  // -- Element loop --\n";
  code << "  __syncthreads();\n";
  code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
  // Input basis apply if needed
  // Generate the correct eval mode code for each input
  code << "    // -- Input field restrictions and basis actions --\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    code << "    // ---- Input field "<<i<<" ----\n";
    // Get elem_size, eval_mode, num_comp
    ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp);
    CeedChkBackend(ierr);

    // TODO: put in a function?
    // Restriction
    if (eval_mode != CEED_EVAL_WEIGHT &&
        !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) {
      code << "    CeedScalar r_u_"<<i<<"[num_comp_in_"<<i<<"*P_in_"<<i<<"];\n";

      bool is_strided;
      ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr);
      if (!is_strided) {
        ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
        CeedChkBackend(ierr);
        code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
        CeedInt comp_stride;
        ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr);
        code << "    // CompStride: "<<comp_stride<<"\n";
        ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
        data->indices.inputs[i] = restr_data->d_ind;
        code << "    readDofsOffset"<<dim<<"d<num_comp_in_"<<i<<", "<<comp_stride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.inputs["<<i<<"], d_u_"<<i<<", r_u_"<<i<<");\n";
      } else {
        bool backendstrides;
        ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
        CeedChkBackend(ierr);
        CeedInt num_elem;
        ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem);
        CeedChkBackend(ierr);
        CeedInt strides[3] = {1, elem_size*num_elem, elem_size};
        if (!backendstrides) {
          ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
          CeedChkBackend(ierr);
        }
        code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
        code << "    readDofsStrided"<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u_"<<i<<", r_u_"<<i<<");\n";
      }
    }

    // TODO: put in a function?
    // Basis action
    code << "    // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n";
    switch (eval_mode) {
    case CEED_EVAL_NONE:
      if (!use_collograd_parallelization) {
        code << "    CeedScalar* r_t_"<<i<<" = r_u_"<<i<<";\n";
      }
      break;
    case CEED_EVAL_INTERP:
      code << "    CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n";
      code << "    Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      if (use_collograd_parallelization) {
        code << "    CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n";
        code << "    Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n";
      } else {
        CeedInt P_1d;
        ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr);
        ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
        code << "    CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*dim*Q_1d];\n";
        code << "    Grad"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t_"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      // Quadrature weights are generated on the fly from the 1D weight data
      code << "    CeedScalar r_t_"<<i<<"[Q_1d];\n";
      ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr);
      ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
      data->W = basis_data->d_q_weight_1d;
      code << "    Weight"<<(dim>1?"Tensor":"")<<dim<<"d<Q_1d>(data, W, r_t_"<<i<<");\n";
      break;
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  // TODO: put in a function + separate colograd logic
  // Q function
  code << "\n    // -- Output field setup --\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    code << "\n    // ---- Output field "<<i<<" ----\n";
    ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
    CeedChkBackend(ierr);
    if (eval_mode==CEED_EVAL_GRAD)
    {
      if (use_collograd_parallelization) {
        //Accumulator for gradient slices
        code << "    CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n";
        code << "    for (CeedInt i = 0; i < num_comp_out_"<<i<<"; i++) {\n";
        code << "      for (CeedInt j = 0; j < Q_1d; ++j) {\n";
        code << "        r_tt_"<<i<<"[j + i*Q_1d] = 0.0;\n";
        code << "      }\n";
        code << "    }\n";
      } else {
        code << "    CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*dim*Q_1d];\n";
      }
    }
    if (eval_mode==CEED_EVAL_NONE || eval_mode==CEED_EVAL_INTERP)
    {
      code << "    CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n";
    }
  }
  // We treat quadrature points per slice in 3d to save registers
  if (use_collograd_parallelization) {
    code << "\n    // Note: Using planes of 3D elements\n";
    code << "#pragma unroll\n";
    code << "    for (CeedInt q = 0; q < Q_1d; q++) {\n";
    code << "      // -- Input fields --\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      code << "      // ---- Input field "<<i<<" ----\n";
      // Get elem_size, eval_mode, num_comp
      ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode);
      CeedChkBackend(ierr);
      // Basis action
      code << "      // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n";
      switch (eval_mode) {
      case CEED_EVAL_NONE:
        code << "      CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n";

        bool is_strided;
        ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); CeedChkBackend(ierr);
        ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr);
        if (!is_strided) {
          ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
          CeedChkBackend(ierr);
          code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
          CeedInt comp_stride;
          ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr);
          code << "      // CompStride: "<<comp_stride<<"\n";
          ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
          data->indices.inputs[i] = restr_data->d_ind;
          code << "      readSliceQuadsOffset3d<num_comp_in_"<<i<<", "<<comp_stride<<", Q_1d>(data, lsize_in_"<<i<<", elem, q, indices.inputs["<<i<<"], d_u_"<<i<<", r_q_"<<i<<");\n";
        } else {
          ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); CeedChkBackend(ierr);
          bool backendstrides;
          ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
          CeedChkBackend(ierr);
          CeedInt num_elem;
          ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem);
          CeedChkBackend(ierr);
          CeedInt strides[3] = {1, elem_size*num_elem, elem_size};
          if (!backendstrides) {
            ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
            CeedChkBackend(ierr);
          }
          code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
          code << "      readSliceQuadsStrided3d<num_comp_in_"<<i<<",Q_1d,"<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u_"<<i<<", r_q_"<<i<<");\n";
        }
        break;
      case CEED_EVAL_INTERP:
        code << "      CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n";
        code << "      for (CeedInt j = 0; j < num_comp_in_"<<i<<" ; ++j) {\n";
        code << "        r_q_"<<i<<"[j] = r_t_"<<i<<"[q + j*Q_1d];\n";
        code << "      }\n";
        break;
      case CEED_EVAL_GRAD:
        code << "      CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"*dim];\n";
        code << "      gradCollo3d<num_comp_in_"<<i<<",Q_1d>(data, q, r_t_"<<i<<", s_G_in_"<<i<<", r_q_"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        code << "      CeedScalar r_q_"<<i<<"[1];\n";
        code << "      r_q_"<<i<<"[0] = r_t_"<<i<<"[q];\n";
        break;
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    code << "\n      // -- Output fields --\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      code << "      // ---- Output field "<<i<<" ----\n";
      ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
      CeedChkBackend(ierr);
      // Basis action
      switch (eval_mode) {
      case CEED_EVAL_NONE:
        code << "      CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << "      CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n";
        break;
      case CEED_EVAL_GRAD:
        code << "      CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"*dim];\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
  } else {
    code << "\n    // Note: Using full elements\n";
    code << "    // -- Input fields --\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      code << "    // ---- Input field "<<i<<" ----\n";
      code << "    CeedScalar* r_q_"<<i<<" = r_t_"<<i<<";\n";
    }
    code << "    // -- Output fields --\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      code << "    // ---- Output field "<<i<<" ----\n";
      code << "    CeedScalar* r_qq_"<<i<<" = r_tt_"<<i<<";\n";
    }
  }
  code << "\n    // -- QFunction Inputs and outputs --\n";
  code << "    CeedScalar* in["<<num_input_fields<<"];\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    code << "    // ---- Input field "<<i<<" ----\n";
    code << "    in["<<i<<"] = r_q_"<<i<<";\n";
  }
  code << "    CeedScalar* out["<<num_output_fields<<"];\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    code << "    // ---- Output field "<<i<<" ----\n";
    code << "    out["<<i<<"] = r_qq_"<<i<<";\n";
  }
  code << "\n    // -- Apply QFunction --\n";
  code << "    "<<q_function_name<<"(ctx, ";
  if (dim != 3 || use_collograd_parallelization) {
    code << "1";
  } else {
    code << "Q_1d";
  }
  code << ", in, out);\n";
  if (use_collograd_parallelization) {
    code << "      // -- Output fields --\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      code << "      // ---- Output field "<<i<<" ----\n";
      ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
      CeedChkBackend(ierr);
      // Basis action
      code << "      // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n";
      switch (eval_mode) {
      case CEED_EVAL_NONE:
        code << "      for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n";
        code << "        r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n";
        code << "      }\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << "      for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n";
        code << "        r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n";
        code << "      }\n";
        break;
      case CEED_EVAL_GRAD:
        code << "      gradColloTranspose3d<num_comp_out_"<<i<<",Q_1d>(data, q, r_qq_"<<i<<", s_G_out_"<<i<<", r_tt_"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    code << "    }\n";
  }

  // Output basis apply if needed
  // Generate the correct eval mode code for each output
  code << "\n    // -- Output field basis action and restrictions --\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    code << "    // ---- Output field "<<i<<" ----\n";
    // Get elem_size, eval_mode, num_comp
    ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp);
    CeedChkBackend(ierr);
    // TODO put in a function
    // Basis action
    code << "    // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n";
    switch (eval_mode) {
    case CEED_EVAL_NONE:
      code << "    CeedScalar* r_v_"<<i<<" = r_tt_"<<i<<";\n";
      break; // No action
    case CEED_EVAL_INTERP:
      code << "    CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n";
      code << "    InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      code << "    CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n";
      if (use_collograd_parallelization) {
        code << "    InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n";
      } else {
        CeedInt P_1d;
        ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr);
        ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
        code << "    GradTranspose"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v_"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    // LCOV_EXCL_STOP
    }
    // TODO put in a function
    // Restriction
    bool is_strided;
    ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr);
    if (!is_strided) {
      ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
      CeedChkBackend(ierr);
      code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
      CeedInt comp_stride;
      ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr);
      code << "    // CompStride: "<<comp_stride<<"\n";
      ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
      data->indices.outputs[i] = restr_data->d_ind;
      code << "    writeDofsOffset"<<dim<<"d<num_comp_out_"<<i<<", "<<comp_stride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.outputs["<<i<<"], r_v_"<<i<<", d_v_"<<i<<");\n";
    } else {
      bool has_backend_strides;
      ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides);
      CeedChkBackend(ierr);
      CeedInt num_elem;
      ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem);
      CeedChkBackend(ierr);
      CeedInt strides[3] = {1, elem_size*num_elem, elem_size};
      if (!has_backend_strides) {
        ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
        CeedChkBackend(ierr);
      }
      code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
      code << "    writeDofsStrided"<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v_"<<i<<", d_v_"<<i<<");\n";
    }
  }

  code << "  }\n";
  code << "}\n";
  code << "// -----------------------------------------------------------------------------\n\n";

  // View kernel for debugging
  CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
  CeedDebug(ceed, code.str().c_str());

  // JIT-compile the generated source; T_1D must accommodate both the largest
  // 1D node count and the 1D quadrature count
  ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
                         "T_1D", CeedIntMax(Q_1d, data->max_P_1d));
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, operator_name.c_str(), &data->op);
  CeedChkBackend(ierr);

  ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
//------------------------------------------------------------------------------