xref: /libCEED/rust/libceed-sys/c-src/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp (revision 437930d19388999b5cc2d76e2fe0d14f58fb41f3)
10d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
20d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
30d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details.
40d0321e0SJeremy L Thompson //
50d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software
60d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral
70d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and
80d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed.
90d0321e0SJeremy L Thompson //
100d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
110d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office
120d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for
130d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including
140d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early
150d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative.
160d0321e0SJeremy L Thompson 
170d0321e0SJeremy L Thompson #include <ceed/ceed.h>
180d0321e0SJeremy L Thompson #include <ceed/backend.h>
19*437930d1SJeremy L Thompson #include <ceed/jit-tools.h>
200d0321e0SJeremy L Thompson #include <iostream>
210d0321e0SJeremy L Thompson #include <sstream>
220d0321e0SJeremy L Thompson #include <string.h>
230d0321e0SJeremy L Thompson #include "ceed-cuda-ref.h"
240d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h"
250d0321e0SJeremy L Thompson 
260d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
270d0321e0SJeremy L Thompson // Build QFunction kernel
280d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
290d0321e0SJeremy L Thompson extern "C" int CeedCudaBuildQFunction(CeedQFunction qf) {
300d0321e0SJeremy L Thompson   CeedInt ierr;
310d0321e0SJeremy L Thompson   using std::ostringstream;
320d0321e0SJeremy L Thompson   using std::string;
330d0321e0SJeremy L Thompson   Ceed ceed;
340d0321e0SJeremy L Thompson   CeedQFunctionGetCeed(qf, &ceed);
350d0321e0SJeremy L Thompson   CeedQFunction_Cuda *data;
360d0321e0SJeremy L Thompson   ierr = CeedQFunctionGetData(qf, (void **)&data); CeedChkBackend(ierr);
37*437930d1SJeremy L Thompson 
380d0321e0SJeremy L Thompson   // QFunction is built
39*437930d1SJeremy L Thompson   if (data->QFunction)
400d0321e0SJeremy L Thompson     return CEED_ERROR_SUCCESS;
41*437930d1SJeremy L Thompson 
42*437930d1SJeremy L Thompson   if (!data->qfunction_source)
43*437930d1SJeremy L Thompson     // LCOV_EXCL_START
44*437930d1SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
45*437930d1SJeremy L Thompson                      "No QFunction source or CUfunction provided.");
46*437930d1SJeremy L Thompson   // LCOV_EXCL_STOP
470d0321e0SJeremy L Thompson 
480d0321e0SJeremy L Thompson   // QFunction kernel generation
49*437930d1SJeremy L Thompson   CeedInt num_input_fields, num_output_fields, size;
50*437930d1SJeremy L Thompson   CeedQFunctionField *input_fields, *output_fields;
51*437930d1SJeremy L Thompson   ierr = CeedQFunctionGetFields(qf, &num_input_fields, &input_fields,
52*437930d1SJeremy L Thompson                                 &num_output_fields, &output_fields);
530d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
540d0321e0SJeremy L Thompson 
550d0321e0SJeremy L Thompson   // Build strings for final kernel
56*437930d1SJeremy L Thompson   char *read_write_kernel_path, *read_write_kernel_source;
57*437930d1SJeremy L Thompson   ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-ref-qfunction.h",
58*437930d1SJeremy L Thompson                              &read_write_kernel_path); CeedChkBackend(ierr);
59*437930d1SJeremy L Thompson   ierr = CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source);
60*437930d1SJeremy L Thompson   CeedChkBackend(ierr);
61*437930d1SJeremy L Thompson   string qfunction_source(data->qfunction_source);
62*437930d1SJeremy L Thompson   string qfunction_name(data->qfunction_name);
63*437930d1SJeremy L Thompson   string read_write(read_write_kernel_source);
64*437930d1SJeremy L Thompson   string kernel_name = "CeedKernel_Cuda_ref_" + qfunction_name;
650d0321e0SJeremy L Thompson   ostringstream code;
660d0321e0SJeremy L Thompson 
670d0321e0SJeremy L Thompson   // Defintions
680d0321e0SJeremy L Thompson   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
690d0321e0SJeremy L Thompson   code << "#define CEED_QFUNCTION_HELPER inline __device__\n";
700d0321e0SJeremy L Thompson   code << "#define CeedPragmaSIMD\n";
710d0321e0SJeremy L Thompson   code << "#define CEED_ERROR_SUCCESS 0\n";
720d0321e0SJeremy L Thompson   code << "#define CEED_Q_VLA 1\n\n";
730d0321e0SJeremy L Thompson   code << "typedef struct { const CeedScalar* inputs[16]; CeedScalar* outputs[16]; } Fields_Cuda;\n";
74*437930d1SJeremy L Thompson   code << read_write;
75*437930d1SJeremy L Thompson   code << qfunction_source;
76*437930d1SJeremy L Thompson   code << "extern \"C\" __global__ void " << kernel_name << "(void *ctx, CeedInt Q, Fields_Cuda fields) {\n";
770d0321e0SJeremy L Thompson 
780d0321e0SJeremy L Thompson   // Inputs
79*437930d1SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
800d0321e0SJeremy L Thompson     code << "  // Input field " << i << "\n";
81*437930d1SJeremy L Thompson     ierr = CeedQFunctionFieldGetSize(input_fields[i], &size); CeedChkBackend(ierr);
820d0321e0SJeremy L Thompson     code << "  const CeedInt size_in_" << i << " = "<<size<<";\n";
830d0321e0SJeremy L Thompson     code << "  CeedScalar r_q" << i << "[size_in_" << i << "];\n";
840d0321e0SJeremy L Thompson   }
85*437930d1SJeremy L Thompson   code << "\n";
860d0321e0SJeremy L Thompson 
870d0321e0SJeremy L Thompson   // Outputs
88*437930d1SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
890d0321e0SJeremy L Thompson     code << "  // Output field " << i << "\n";
90*437930d1SJeremy L Thompson     ierr = CeedQFunctionFieldGetSize(output_fields[i], &size); CeedChkBackend(ierr);
910d0321e0SJeremy L Thompson     code << "  const CeedInt size_out_" << i << " = " << size << ";\n";
920d0321e0SJeremy L Thompson     code << "  CeedScalar r_qq" << i << "[size_out_" << i << "];\n";
930d0321e0SJeremy L Thompson   }
94*437930d1SJeremy L Thompson   code << "\n";
950d0321e0SJeremy L Thompson 
960d0321e0SJeremy L Thompson   // Setup input/output arrays
97*437930d1SJeremy L Thompson   code << "  const CeedScalar* in[" << num_input_fields << "];\n";
98*437930d1SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
990d0321e0SJeremy L Thompson     code << "    in[" << i << "] = r_q" << i << ";\n";
1000d0321e0SJeremy L Thompson   }
101*437930d1SJeremy L Thompson   code << "  CeedScalar* out[" << num_output_fields << "];\n";
102*437930d1SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
1030d0321e0SJeremy L Thompson     code << "    out[" << i << "] = r_qq" << i << ";\n";
1040d0321e0SJeremy L Thompson   }
105*437930d1SJeremy L Thompson   code << "\n";
1060d0321e0SJeremy L Thompson 
1070d0321e0SJeremy L Thompson   // Loop over quadrature points
1080d0321e0SJeremy L Thompson   code << "  for (CeedInt q = blockIdx.x * blockDim.x + threadIdx.x; q < Q; q += blockDim.x * gridDim.x) {\n";
1090d0321e0SJeremy L Thompson 
1100d0321e0SJeremy L Thompson   // Load inputs
111*437930d1SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1120d0321e0SJeremy L Thompson     code << "    // Input field " << i << "\n";
1130d0321e0SJeremy L Thompson     code << "    readQuads<size_in_" << i << ">(q, Q, fields.inputs[" << i << "], r_q" << i << ");\n";
1140d0321e0SJeremy L Thompson   }
1150d0321e0SJeremy L Thompson   // QFunction
1160d0321e0SJeremy L Thompson   code << "    // QFunction\n";
117*437930d1SJeremy L Thompson   code << "    " << qfunction_name << "(ctx, 1, in, out);\n";
1180d0321e0SJeremy L Thompson 
1190d0321e0SJeremy L Thompson   // Write outputs
120*437930d1SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
1210d0321e0SJeremy L Thompson     code << "    // Output field " << i << "\n";
1220d0321e0SJeremy L Thompson     code << "    writeQuads<size_out_" << i << ">(q, Q, r_qq" << i << ", fields.outputs[" << i << "]);\n";
1230d0321e0SJeremy L Thompson   }
1240d0321e0SJeremy L Thompson   code << "  }\n";
1250d0321e0SJeremy L Thompson   code << "}\n";
1260d0321e0SJeremy L Thompson 
1270d0321e0SJeremy L Thompson   // View kernel for debugging
128*437930d1SJeremy L Thompson   CeedDebug256(ceed, 1, "Generated QFunction Kernels:\n");
1290d0321e0SJeremy L Thompson   CeedDebug(ceed, code.str().c_str());
1300d0321e0SJeremy L Thompson 
1310d0321e0SJeremy L Thompson   // Compile kernel
1320d0321e0SJeremy L Thompson   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0);
1330d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
134*437930d1SJeremy L Thompson   ierr = CeedGetKernelCuda(ceed, data->module, kernel_name.c_str(), &data->QFunction);
1350d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
1360d0321e0SJeremy L Thompson 
1370d0321e0SJeremy L Thompson   // Cleanup
138*437930d1SJeremy L Thompson   ierr = CeedFree(&data->qfunction_source); CeedChkBackend(ierr);
139*437930d1SJeremy L Thompson   ierr = CeedFree(&read_write_kernel_path); CeedChkBackend(ierr);
140*437930d1SJeremy L Thompson   ierr = CeedFree(&read_write_kernel_source); CeedChkBackend(ierr);
141*437930d1SJeremy L Thompson 
1420d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
1430d0321e0SJeremy L Thompson }
1440d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
145