Lines Matching refs:tab
180 …ernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i, in CeedOperatorBuildKernelFieldData_Cuda_gen() argument
210 …code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name… in CeedOperatorBuildKernelFieldData_Cuda_gen()
228 code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
233 …code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << "… in CeedOperatorBuildKernelFieldData_Cuda_gen()
235 …code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << … in CeedOperatorBuildKernelFieldData_Cuda_gen()
237 code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
241 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
270 code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
276 code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
278 …code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\… in CeedOperatorBuildKernelFieldData_Cuda_gen()
279 …code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << … in CeedOperatorBuildKernelFieldData_Cuda_gen()
309 code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
315 code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
317 …code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\… in CeedOperatorBuildKernelFieldData_Cuda_gen()
318 …code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << … in CeedOperatorBuildKernelFieldData_Cuda_gen()
329 code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
331 code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
333 …code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\… in CeedOperatorBuildKernelFieldData_Cuda_gen()
334 …code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << … in CeedOperatorBuildKernelFieldData_Cuda_gen()
345 code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
347 code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
349 …code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\… in CeedOperatorBuildKernelFieldData_Cuda_gen()
350 …code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << … in CeedOperatorBuildKernelFieldData_Cuda_gen()
356 code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
358 code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; in CeedOperatorBuildKernelFieldData_Cuda_gen()
360 …code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_… in CeedOperatorBuildKernelFieldData_Cuda_gen()
362 …code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tenso… in CeedOperatorBuildKernelFieldData_Cuda_gen()
383 …nelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i, in CeedOperatorBuildKernelRestriction_Cuda_gen() argument
413 code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n"; in CeedOperatorBuildKernelRestriction_Cuda_gen()
417 …code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];… in CeedOperatorBuildKernelRestriction_Cuda_gen()
420 code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; in CeedOperatorBuildKernelRestriction_Cuda_gen()
427 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; in CeedOperatorBuildKernelRestriction_Cuda_gen()
429 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; in CeedOperatorBuildKernelRestriction_Cuda_gen()
431 …code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix <… in CeedOperatorBuildKernelRestriction_Cuda_gen()
447 …code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << va… in CeedOperatorBuildKernelRestriction_Cuda_gen()
449 …code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix <<… in CeedOperatorBuildKernelRestriction_Cuda_gen()
458 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; in CeedOperatorBuildKernelRestriction_Cuda_gen()
476 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; in CeedOperatorBuildKernelRestriction_Cuda_gen()
478 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; in CeedOperatorBuildKernelRestriction_Cuda_gen()
480 …code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix … in CeedOperatorBuildKernelRestriction_Cuda_gen()
496 …code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << va… in CeedOperatorBuildKernelRestriction_Cuda_gen()
498 …code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix <… in CeedOperatorBuildKernelRestriction_Cuda_gen()
520 …ildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i, in CeedOperatorBuildKernelBasis_Cuda_gen() argument
550 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; in CeedOperatorBuildKernelBasis_Cuda_gen()
555 code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n"; in CeedOperatorBuildKernelBasis_Cuda_gen()
562 …code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q… in CeedOperatorBuildKernelBasis_Cuda_gen()
563 …code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << "… in CeedOperatorBuildKernelBasis_Cuda_gen()
571 …code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tenso… in CeedOperatorBuildKernelBasis_Cuda_gen()
572 …code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << "… in CeedOperatorBuildKernelBasis_Cuda_gen()
580 …code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q… in CeedOperatorBuildKernelBasis_Cuda_gen()
581 …code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << "… in CeedOperatorBuildKernelBasis_Cuda_gen()
587 …code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];… in CeedOperatorBuildKernelBasis_Cuda_gen()
588 …code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << "… in CeedOperatorBuildKernelBasis_Cuda_gen()
597 …code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix… in CeedOperatorBuildKernelBasis_Cuda_gen()
599 …code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << "… in CeedOperatorBuildKernelBasis_Cuda_gen()
604 …code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix… in CeedOperatorBuildKernelBasis_Cuda_gen()
605 …code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_na… in CeedOperatorBuildKernelBasis_Cuda_gen()
611 code << tab << "// Nothing to do AtPoints\n"; in CeedOperatorBuildKernelBasis_Cuda_gen()
618 …code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1… in CeedOperatorBuildKernelBasis_Cuda_gen()
621 …code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix … in CeedOperatorBuildKernelBasis_Cuda_gen()
634 code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n"; in CeedOperatorBuildKernelBasis_Cuda_gen()
637 code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; in CeedOperatorBuildKernelBasis_Cuda_gen()
641 …code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << "… in CeedOperatorBuildKernelBasis_Cuda_gen()
650 …code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << "… in CeedOperatorBuildKernelBasis_Cuda_gen()
655 code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; in CeedOperatorBuildKernelBasis_Cuda_gen()
659 …code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << "… in CeedOperatorBuildKernelBasis_Cuda_gen()
665 …code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << "… in CeedOperatorBuildKernelBasis_Cuda_gen()
675 …code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << "… in CeedOperatorBuildKernelBasis_Cuda_gen()
680 …code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_na… in CeedOperatorBuildKernelBasis_Cuda_gen()
700 …Function_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt max_dim, in CeedOperatorBuildKernelQFunction_Cuda_gen() argument
712 code << tab << "// -- Output field setup\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
718 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
723 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
725 …code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tenso… in CeedOperatorBuildKernelQFunction_Cuda_gen()
732 …code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3… in CeedOperatorBuildKernelQFunction_Cuda_gen()
733 …code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name :… in CeedOperatorBuildKernelQFunction_Cuda_gen()
736 …code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tenso… in CeedOperatorBuildKernelQFunction_Cuda_gen()
743 …code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3… in CeedOperatorBuildKernelQFunction_Cuda_gen()
744 …code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name :… in CeedOperatorBuildKernelQFunction_Cuda_gen()
748 …code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];… in CeedOperatorBuildKernelQFunction_Cuda_gen()
749 …code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" <… in CeedOperatorBuildKernelQFunction_Cuda_gen()
751 …code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix… in CeedOperatorBuildKernelQFunction_Cuda_gen()
768 code << tab << "// Note: Using batches of points\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
769 …code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_point… in CeedOperatorBuildKernelQFunction_Cuda_gen()
770 code << tab << "#pragma unroll\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
771 …code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += … in CeedOperatorBuildKernelQFunction_Cuda_gen()
772 tab.push(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
773 code << tab << "const CeedInt p = i % max_num_points;\n\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
775 code << tab << "// -- Coordinates\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
776 code << tab << "CeedScalar r_x[max_dim];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
777 …code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_poin… in CeedOperatorBuildKernelQFunction_Cuda_gen()
779 code << tab << "// -- Input fields\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
786 code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
789 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
792 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
793 code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix in CeedOperatorBuildKernelQFunction_Cuda_gen()
797 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
798 …code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " <… in CeedOperatorBuildKernelQFunction_Cuda_gen()
802 …code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix… in CeedOperatorBuildKernelQFunction_Cuda_gen()
803 …code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << … in CeedOperatorBuildKernelQFunction_Cuda_gen()
807 code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
808 code << tab << "r_s" << var_suffix << "[0] = 1.0;\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
818 code << tab << "// -- Output fields\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
824 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
829 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
832 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
835 …code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix… in CeedOperatorBuildKernelQFunction_Cuda_gen()
850 code << tab << "// Note: Using planes of 3D elements\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
851 code << tab << "#pragma unroll\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
852 code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
853 tab.push(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
854 code << tab << "// -- Input fields\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
860 code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
863 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
868 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
884 …code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << va… in CeedOperatorBuildKernelQFunction_Cuda_gen()
886 …code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" <<… in CeedOperatorBuildKernelQFunction_Cuda_gen()
894 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
896 … code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
899 …code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix <… in CeedOperatorBuildKernelQFunction_Cuda_gen()
905 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
906 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
907 tab.push(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
908 … code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
909 tab.pop(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
910 code << tab << "}\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
913 …code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix… in CeedOperatorBuildKernelQFunction_Cuda_gen()
914 …code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q,… in CeedOperatorBuildKernelQFunction_Cuda_gen()
918 code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
919 code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
929 code << tab << "// -- Output fields\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
935 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
940 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
943 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
946 …code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix… in CeedOperatorBuildKernelQFunction_Cuda_gen()
959 code << tab << "// Note: Using full elements\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
960 code << tab << "{\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
961 tab.push(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
962 code << tab << "// -- Input fields\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
967 code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
968 code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
970 code << tab << "// -- Output fields\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
975 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
976 code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
982 code << tab << "// -- QFunction inputs and outputs\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
983 code << tab << "// ---- Inputs\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
984 code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
989 code << tab << "// ------ Input field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
990 code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
992 code << tab << "// ---- Outputs\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
993 code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
998 code << tab << "// ------ Output field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
999 code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1004 code << tab << "// -- Apply QFunction\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1005 code << tab << "" << qfunction_name << "(ctx, "; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1016 code << tab << "// -- Output fields\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1023 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1026 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1036 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1037 code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix in CeedOperatorBuildKernelQFunction_Cuda_gen()
1043 code << tab << "if (i >= points.num_per_elem[elem]) {\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1044 tab.push(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
1045 …code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "… in CeedOperatorBuildKernelQFunction_Cuda_gen()
1046 tab.pop(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
1047 code << tab << "}\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1048 …code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_po… in CeedOperatorBuildKernelQFunction_Cuda_gen()
1052 code << tab << "if (i >= points.num_per_elem[elem]) {\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1053 tab.push(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
1054 …code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++)… in CeedOperatorBuildKernelQFunction_Cuda_gen()
1055 tab.pop(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
1056 code << tab << "}\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1057 …code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_poin… in CeedOperatorBuildKernelQFunction_Cuda_gen()
1072 code << tab << "// -- Output fields\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1079 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1082 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1085 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1086 tab.push(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
1087 … code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1088 tab.pop(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
1089 code << tab << "}\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1092 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1093 tab.push(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
1094 … code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1095 tab.pop(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
1096 code << tab << "}\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1099 …code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>… in CeedOperatorBuildKernelQFunction_Cuda_gen()
1112 tab.pop(); in CeedOperatorBuildKernelQFunction_Cuda_gen()
1113 code << tab << "}\n"; in CeedOperatorBuildKernelQFunction_Cuda_gen()
1130 Tab tab; in CeedOperatorBuildKernel_Cuda_gen() local
1264 code << tab << "// AtomicAdd fallback source\n"; in CeedOperatorBuildKernel_Cuda_gen()
1265 code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n"; in CeedOperatorBuildKernel_Cuda_gen()
1271 code << tab << "// Tensor basis source\n"; in CeedOperatorBuildKernel_Cuda_gen()
1272 code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n"; in CeedOperatorBuildKernel_Cuda_gen()
1275 code << tab << "// Non-tensor basis source\n"; in CeedOperatorBuildKernel_Cuda_gen()
1276 code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n"; in CeedOperatorBuildKernel_Cuda_gen()
1296 code << "\n" << tab << "#undef CEED_Q_VLA\n"; in CeedOperatorBuildKernel_Cuda_gen()
1298 code << tab << "#define CEED_Q_VLA 1\n\n"; in CeedOperatorBuildKernel_Cuda_gen()
1300 code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; in CeedOperatorBuildKernel_Cuda_gen()
1310 code << tab << "// User QFunction source\n"; in CeedOperatorBuildKernel_Cuda_gen()
1311 code << tab << "#include \"" << source_path << "\"\n\n"; in CeedOperatorBuildKernel_Cuda_gen()
1315 …code << "\n" << tab << "// -----------------------------------------------------------------------… in CeedOperatorBuildKernel_Cuda_gen()
1316 code << tab << "// Operator Kernel\n"; in CeedOperatorBuildKernel_Cuda_gen()
1317 code << tab << "// \n"; in CeedOperatorBuildKernel_Cuda_gen()
1318 code << tab << "// d_[in,out]_i: CeedVector device array\n"; in CeedOperatorBuildKernel_Cuda_gen()
1319 code << tab << "// r_[in,out]_e_i: Element vector register\n"; in CeedOperatorBuildKernel_Cuda_gen()
1320 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; in CeedOperatorBuildKernel_Cuda_gen()
1321 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; in CeedOperatorBuildKernel_Cuda_gen()
1322 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; in CeedOperatorBuildKernel_Cuda_gen()
1323 code << tab << "// \n"; in CeedOperatorBuildKernel_Cuda_gen()
1324 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; in CeedOperatorBuildKernel_Cuda_gen()
1325 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; in CeedOperatorBuildKernel_Cuda_gen()
1326 …code << tab << "// -----------------------------------------------------------------------------\n… in CeedOperatorBuildKernel_Cuda_gen()
1327 code << tab << "extern \"C\" __global__ void " << operator_name in CeedOperatorBuildKernel_Cuda_gen()
1330 tab.push(); in CeedOperatorBuildKernel_Cuda_gen()
1338 … code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; in CeedOperatorBuildKernel_Cuda_gen()
1342 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; in CeedOperatorBuildKernel_Cuda_gen()
1345 code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; in CeedOperatorBuildKernel_Cuda_gen()
1347 code << tab << "const CeedInt Q = " << Q << ";\n"; in CeedOperatorBuildKernel_Cuda_gen()
1350 code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; in CeedOperatorBuildKernel_Cuda_gen()
1353 code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; in CeedOperatorBuildKernel_Cuda_gen()
1354 code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; in CeedOperatorBuildKernel_Cuda_gen()
1358 code << tab << "extern __shared__ CeedScalar slice[];\n"; in CeedOperatorBuildKernel_Cuda_gen()
1359 code << tab << "SharedData_Cuda data;\n"; in CeedOperatorBuildKernel_Cuda_gen()
1360 code << tab << "data.t_id_x = threadIdx.x;\n"; in CeedOperatorBuildKernel_Cuda_gen()
1361 code << tab << "data.t_id_y = threadIdx.y;\n"; in CeedOperatorBuildKernel_Cuda_gen()
1362 code << tab << "data.t_id_z = threadIdx.z;\n"; in CeedOperatorBuildKernel_Cuda_gen()
1363 …code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockD… in CeedOperatorBuildKernel_Cuda_gen()
1364 …code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? … in CeedOperatorBuildKernel_Cuda_gen()
1471 code << "\n" << tab << "// Input field constants and basis data\n"; in CeedOperatorBuildKernel_Cuda_gen()
1473 …CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], … in CeedOperatorBuildKernel_Cuda_gen()
1476 code << "\n" << tab << "// Output field constants and basis data\n"; in CeedOperatorBuildKernel_Cuda_gen()
1478 …CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i],… in CeedOperatorBuildKernel_Cuda_gen()
1483 code << "\n" << tab << "// Element loop\n"; in CeedOperatorBuildKernel_Cuda_gen()
1484 code << tab << "__syncthreads();\n"; in CeedOperatorBuildKernel_Cuda_gen()
1485 …code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += … in CeedOperatorBuildKernel_Cuda_gen()
1486 tab.push(); in CeedOperatorBuildKernel_Cuda_gen()
1519 code << tab << "// Scratch restriction buffer space\n"; in CeedOperatorBuildKernel_Cuda_gen()
1520 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; in CeedOperatorBuildKernel_Cuda_gen()
1567 code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; in CeedOperatorBuildKernel_Cuda_gen()
1573 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; in CeedOperatorBuildKernel_Cuda_gen()
1576 …CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buff… in CeedOperatorBuildKernel_Cuda_gen()
1580 …CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_i… in CeedOperatorBuildKernel_Cuda_gen()
1585 …CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points… in CeedOperatorBuildKernel_Cuda_gen()
1590 code << "\n" << tab << "// -- Output field basis action and restrictions\n"; in CeedOperatorBuildKernel_Cuda_gen()
1595 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernel_Cuda_gen()
1598 …CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_… in CeedOperatorBuildKernel_Cuda_gen()
1602 …CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, i, NULL, op_output_fi… in CeedOperatorBuildKernel_Cuda_gen()
1607 tab.pop(); in CeedOperatorBuildKernel_Cuda_gen()
1608 code << tab << "}\n"; in CeedOperatorBuildKernel_Cuda_gen()
1609 tab.pop(); in CeedOperatorBuildKernel_Cuda_gen()
1610 code << tab << "}\n"; in CeedOperatorBuildKernel_Cuda_gen()
1611 …code << tab << "// -----------------------------------------------------------------------------\n… in CeedOperatorBuildKernel_Cuda_gen()
1647 Tab tab; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen() local
1680 code << tab << "// AtomicAdd fallback source\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1681 code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1686 code << tab << "// Tensor basis source\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1687 code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1688 code << tab << "// AtPoints basis source\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1689 …code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1690 code << tab << "// CodeGen operator source\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1691 code << tab << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1704 code << "\n" << tab << "#undef CEED_Q_VLA\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1705 code << tab << "#define CEED_Q_VLA 1\n\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1714 code << tab << "// User QFunction source\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1715 code << tab << "#include \"" << source_path << "\"\n\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1719 …code << "\n" << tab << "// -----------------------------------------------------------------------… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1720 code << tab << "// Operator Assembly Kernel\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1721 code << tab << "// \n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1722 code << tab << "// d_[in,out]_i: CeedVector device array\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1723 code << tab << "// r_[in,out]_e_i: Element vector register\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1724 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1725 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1726 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1727 code << tab << "// \n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1728 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1729 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1730 …code << tab << "// -----------------------------------------------------------------------------\n… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1731 code << tab << "extern \"C\" __global__ void " << operator_name in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1734 tab.push(); in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1742 … code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1746 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1749 code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1750 code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1751 code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1752 code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1755 code << tab << "extern __shared__ CeedScalar slice[];\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1756 code << tab << "SharedData_Cuda data;\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1757 code << tab << "data.t_id_x = threadIdx.x;\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1758 code << tab << "data.t_id_y = threadIdx.y;\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1759 code << tab << "data.t_id_z = threadIdx.z;\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1760 …code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockD… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1761 …code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? … in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1837 code << "\n" << tab << "// Input field constants and basis data\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1839 …CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], … in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1842 code << "\n" << tab << "// Output field constants and basis data\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1844 …CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i],… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1849 code << "\n" << tab << "// Element loop\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1850 code << tab << "__syncthreads();\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1851 …code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += … in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1852 tab.push(); in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1885 code << tab << "// Scratch restriction buffer space\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1886 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1933 code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1950 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1955 code << tab << "// Active field - no restriction or basis action here\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1958 …code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1961 … code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1965 …CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buff… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1969 …CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_i… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1977 code << "\n" << tab << "// Loop over nodes in active field\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1978 …code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_su… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1980 tab.push(); in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1983 code << tab << "// Set current active node and component to 1.0\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
1984 …code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2002 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2005 …CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_i… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2010 …CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2015 code << "\n" << tab << "// -- Output field basis action and restrictions\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2030 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2033 …CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2045 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2047 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2048 …code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_st… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2059 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2061 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2062 …code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stri… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2069 code << "\n" << tab << "// Reset current active node and component to 0.0\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2070 …code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2074 tab.pop(); in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2075 code << tab << "}\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2078 tab.pop(); in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2079 code << tab << "}\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2080 tab.pop(); in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2081 code << tab << "}\n"; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2082 …code << tab << "// -----------------------------------------------------------------------------\n… in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2127 Tab tab; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen() local
2204 code << tab << "// AtomicAdd fallback source\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2205 code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2211 code << tab << "// Tensor basis source\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2212 code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2215 code << tab << "// Non-tensor basis source\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2216 code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2232 code << "\n" << tab << "#undef CEED_Q_VLA\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2234 code << tab << "#define CEED_Q_VLA 1\n\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2236 code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2246 code << tab << "// User QFunction source\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2247 code << tab << "#include \"" << source_path << "\"\n\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2251 …code << "\n" << tab << "// -----------------------------------------------------------------------… in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2252 code << tab << "// Operator Assembly Kernel\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2253 code << tab << "// \n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2254 code << tab << "// d_[in,out]_i: CeedVector device array\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2255 code << tab << "// r_[in,out]_e_i: Element vector register\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2256 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2257 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2258 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2259 code << tab << "// \n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2260 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2261 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2262 …code << tab << "// -----------------------------------------------------------------------------\n… in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2263 code << tab << "extern \"C\" __global__ void " << operator_name in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2266 tab.push(); in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2274 … code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2288 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2292 code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2294 code << tab << "const CeedInt Q = " << Q << ";\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2297 code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2301 code << tab << "extern __shared__ CeedScalar slice[];\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2302 code << tab << "SharedData_Cuda data;\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2303 code << tab << "data.t_id_x = threadIdx.x;\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2304 code << tab << "data.t_id_y = threadIdx.y;\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2305 code << tab << "data.t_id_z = threadIdx.z;\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2306 …code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockD… in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2307 …code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? … in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2414 code << "\n" << tab << "// Input field constants and basis data\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2416 …CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], … in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2419 code << "\n" << tab << "// Output field constants and basis data\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2421 …CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i],… in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2426 code << "\n" << tab << "// Element loop\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2427 code << tab << "__syncthreads();\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2428 …code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += … in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2429 tab.push(); in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2462 code << tab << "// Scratch restriction buffer space\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2463 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2510 code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2528 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2539 … code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2542 …code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_… in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2544 code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2547 …CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buff… in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2551 …CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_i… in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2555 code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2560 code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2585 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2586 code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2589 code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2594 code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2597 code << "\n" << tab << "CeedInt input_offset = 0;\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2598 code << tab << "// Loop over active QFunction input fields\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2599 code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2600 code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2601 tab.push(); in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2604 code << "\n" << tab << "// Loop over current active input field size\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2605 code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2606 code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2607 tab.push(); in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2610 code << tab << "// Set current active point and component to 1.0\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2612 code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2614 code << tab << "r_q_in[a][s] = 1.0;\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2618 …CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points… in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2623 code << "\n" << tab << "// -- Output field basis action and restrictions\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2640 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2645 …code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_ou… in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2652 code << "\n" << tab << "// Reset current active node and component to 0.0\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2654 code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2656 code << tab << "r_q_in[a][s] = 0.0;\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2660 tab.pop(); in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2661 code << tab << "}\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2662 code << tab << "input_offset += field_size_in;\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2665 tab.pop(); in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2666 code << tab << "}\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2669 tab.pop(); in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2670 code << tab << "}\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2671 tab.pop(); in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2672 code << tab << "}\n"; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
2673 …code << tab << "// -----------------------------------------------------------------------------\n… in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()