# Training examples: (query, list of function calls)
#
# Every entry has the shape {"query": <str>, "calls": [<step>, ...]} and every
# step has the shape {"function": <name>, "args": {...}}. The small builders
# below only remove the repeated dict boilerplate; each call returns a fresh
# dict, so no step object is shared between examples.


def _step(fn_name, **args):
    """Return one step dict; the keyword arguments become its "args" mapping."""
    return {"function": fn_name, "args": args}


def _load(table):
    """Return a "load_table" step for `table`."""
    return _step("load_table", table=table)


def _join(right_table, on):
    """Return a "join_table" step joining `right_table` on the `on` column."""
    return _step("join_table", right_table=right_table, on=on)


def _where(column, operator, value):
    """Return a "filter_rows" step comparing `column` to `value` with `operator`."""
    return _step("filter_rows", column=column, operator=operator, value=value)


def _example(query, *steps):
    """Return one training example: the query text plus its ordered steps."""
    return {"query": query, "calls": list(steps)}


training_examples = [
    # === Simple single-table queries ===
    _example(
        "How many weather stations do we have?",
        _load("stations"),
        _step("count_rows"),
    ),
    _example(
        "List all Primary type stations",
        _load("stations"),
        _where("station_type", "==", "Primary"),
        _step("top_k", k=100),
    ),
    _example(
        "What is the population of the Urban region?",
        _load("population"),
        _where("region", "==", "Urban"),
        _step("get_value", column="population", aggregation="first"),
    ),
    _example(
        "Show the 5 oldest weather stations",
        _load("stations"),
        _step("bottom_k", k=5, column="install_year"),
    ),
    _example(
        "Count weather readings from October 2024",
        _load("weather"),
        _step("filter_date_range", column="date", start_date="2024-10-01", end_date="2024-10-31"),
        _step("count_rows"),
    ),

    # === Two-table joins ===
    _example(
        "What is the average temperature by region?",
        _load("weather"),
        _join("stations", "station_id"),
        _step("aggregate", group_by=["region"], aggregations={"temperature_c": "mean"}),
        _step("top_k", k=10),
    ),
    _example(
        "Which stations in Suburban areas have the most alerts?",
        _load("alerts"),
        _join("stations", "station_id"),
        _where("region", "==", "Suburban"),
        _step("aggregate", group_by=["station_id", "station_name"], aggregations={"alert_id": "count"}),
        _step("top_k", k=5, column="alert_id"),
    ),
    _example(
        "Find the coldest station overall",
        _load("weather"),
        _join("stations", "station_id"),
        _step("aggregate", group_by=["station_id", "station_name"], aggregations={"temperature_c": "mean"}),
        _step("bottom_k", k=1, column="temperature_c"),
    ),
    _example(
        "How many Critical severity alerts occurred?",
        _load("alerts"),
        _where("severity", "==", "Critical"),
        _step("count_rows"),
    ),
    _example(
        "What's the total precipitation by station type?",
        _load("weather"),
        _join("stations", "station_id"),
        _step("aggregate", group_by=["station_type"], aggregations={"precipitation_mm": "sum"}),
        _step("top_k", k=10),
    ),

    # === Three-table joins ===
    _example(
        "Which stations in high population areas have the worst air quality?",
        _load("weather"),
        _join("stations", "station_id"),
        _join("population", "region"),
        _where("population", ">", 400000),
        _step("aggregate", group_by=["station_id", "station_name"], aggregations={"air_quality_index": "mean"}),
        _step("top_k", k=5, column="air_quality_index"),
    ),
    _example(
        "Average temperature in Urban areas in November",
        _load("weather"),
        _join("stations", "station_id"),
        _where("region", "==", "Urban"),
        _step("filter_date_range", column="date", start_date="2024-11-01", end_date="2024-11-30"),
        _step("get_value", column="temperature_c", aggregation="mean"),
    ),
    # NOTE(review): "last week" is pinned to fixed dates (2024-12-17..2024-12-24);
    # presumably relative to a reference date of 2024-12-24 -- confirm.
    _example(
        "Find the hottest station in Coastal region last week",
        _load("weather"),
        _join("stations", "station_id"),
        _where("region", "==", "Coastal"),
        _step("filter_date_range", column="date", start_date="2024-12-17", end_date="2024-12-24"),
        _step("aggregate", group_by=["station_id", "station_name"], aggregations={"temperature_c": "mean"}),
        _step("top_k", k=1, column="temperature_c"),
    ),

    # === Rolling window queries ===
    _example(
        "What's the 7-day average temperature trend for station ST001?",
        _load("weather"),
        _where("station_id", "==", "ST001"),
        _step("rolling_window", column="temperature_c", window=7, function="mean"),
        _step("top_k", k=30),
    ),
    _example(
        "Calculate 3-day rolling max temperature by station",
        _load("weather"),
        _step("rolling_window", column="temperature_c", window=3, function="max", group_by="station_id"),
        _step("top_k", k=50),
    ),
    _example(
        "Which station had the highest 7-day average temperature in November in highly populated areas?",
        _load("weather"),
        _join("stations", "station_id"),
        _join("population", "region"),
        _where("population", ">", 400000),
        _step("filter_date_range", column="date", start_date="2024-11-01", end_date="2024-11-30"),
        _step("rolling_window", column="temperature_c", window=7, function="mean", group_by="station_id"),
        _step(
            "aggregate",
            group_by=["station_id", "station_name"],
            aggregations={"temperature_c_rolling_7_mean": "max"},
        ),
        _step("top_k", k=1, column="temperature_c_rolling_7_mean"),
    ),

    # === More filter variations ===
    _example(
        "Find days with temperature above 30 degrees",
        _load("weather"),
        _where("temperature_c", ">", 30),
        _step("top_k", k=50),
    ),
    _example(
        "Stations at elevation above 300 meters",
        _load("stations"),
        _where("elevation_m", ">", 300),
        _step("top_k", k=20),
    ),
    _example(
        "Low humidity days (below 40%) count",
        _load("weather"),
        _where("humidity_pct", "<", 40),
        _step("count_rows"),
    ),

    # === Complex multi-step queries ===
    # NOTE(review): no step below divides the alert count by population, and the
    # final top_k names no sort column -- confirm this is the intended target
    # for a "per capita" query.
    _example(
        "Which region has the most weather alerts per capita?",
        _load("alerts"),
        _join("stations", "station_id"),
        _step("aggregate", group_by=["region"], aggregations={"alert_id": "count"}),
        _join("population", "region"),
        _step("top_k", k=10),
    ),
    _example(
        "Average wind speed for stations installed after 2020",
        _load("weather"),
        _join("stations", "station_id"),
        _where("install_year", ">", 2020),
        _step("get_value", column="wind_speed_kmh", aggregation="mean"),
    ),
    _example(
        "Heat Wave alerts in Rural areas",
        _load("alerts"),
        _join("stations", "station_id"),
        _where("region", "==", "Rural"),
        _where("alert_type", "==", "Heat Wave"),
        _step("count_rows"),
    ),
    _example(
        "Highest precipitation day in December",
        _load("weather"),
        _step("filter_date_range", column="date", start_date="2024-12-01", end_date="2024-12-31"),
        _step("top_k", k=1, column="precipitation_mm"),
    ),
    _example(
        "Compare average temperature between Urban and Rural regions",
        _load("weather"),
        _join("stations", "station_id"),
        _where("region", "in", ["Urban", "Rural"]),
        _step("aggregate", group_by=["region"], aggregations={"temperature_c": "mean"}),
        _step("top_k", k=10),
    ),
    _example(
        "Station with most consistent temperature (lowest std dev)",
        _load("weather"),
        _join("stations", "station_id"),
        _step("aggregate", group_by=["station_id", "station_name"], aggregations={"temperature_c": "std"}),
        _step("bottom_k", k=1, column="temperature_c"),
    ),
    # NOTE(review): the steps end with a population join and an unsorted top_k;
    # none of them references an income column -- confirm this target.
    _example(
        "Median income of regions with temperature above 25 degrees on average",
        _load("weather"),
        _join("stations", "station_id"),
        _step("aggregate", group_by=["region"], aggregations={"temperature_c": "mean"}),
        _where("temperature_c", ">", 25),
        _join("population", "region"),
        _step("top_k", k=10),
    ),

    # === More examples for better coverage ===
    _example(
        "List all stations in the Urban region",
        _load("stations"),
        _where("region", "==", "Urban"),
        _step("top_k", k=50),
    ),
    _example(
        "Total rainfall in Suburban areas",
        _load("weather"),
        _join("stations", "station_id"),
        _where("region", "==", "Suburban"),
        _step("get_value", column="precipitation_mm", aggregation="sum"),
    ),
    _example(
        "Most recent weather reading for each station",
        _load("weather"),
        _step("sort_values", column="date", ascending=False),
        _step("aggregate", group_by=["station_id"], aggregations={"date": "first", "temperature_c": "first"}),
        _step("top_k", k=20),
    ),
    _example(
        "Average air quality by region",
        _load("weather"),
        _join("stations", "station_id"),
        _step("aggregate", group_by=["region"], aggregations={"air_quality_index": "mean"}),
        _step("top_k", k=10),
    ),
    _example(
        "Storm alerts count by station",
        _load("alerts"),
        _where("alert_type", "==", "Storm"),
        _step("aggregate", group_by=["station_id"], aggregations={"alert_id": "count"}),
        _step("top_k", k=20, column="alert_id"),
    ),
    _example(
        "Windiest day on record",
        _load("weather"),
        _step("top_k", k=1, column="wind_speed_kmh"),
    ),
]
# Report how many examples were built.
print(f"Created {len(training_examples)} training examples")

# Pretty-print one entry as a quick sanity check of the structure.
# NOTE(review): `json` is not imported in the visible part of this file --
# confirm it is imported earlier.
print("\nSample:")
sample_example = training_examples[15]  # The complex 7-day average query
print(json.dumps(sample_example, indent=2))