import re, subprocess, json
# Benchmark tasks. Each entry holds:
#   name   - the function the model must define (also the callable under test)
#   prompt - the exact text sent to the model
#   tests  - (argument source, expected repr) pairs; the argument source is
#            pasted verbatim between the parentheses of a call to `name`, and
#            the expected value is compared against repr() of the result.
TASKS = [
    {
        'name': 'spiral_order',
        'prompt': (
            'Write a Python function spiral_order(matrix: list[list[int]]) '
            '-> list[int] that returns all elements of an m x n matrix in '
            'spiral (clockwise) order starting from the top-left. Handle '
            'empty matrices and non-square shapes. ONLY output a fenced '
            'python block.'
        ),
        'tests': [
            ("[[1,2,3],[4,5,6],[7,8,9]]", "[1, 2, 3, 6, 9, 8, 7, 4, 5]"),
            ("[[1,2,3,4],[5,6,7,8],[9,10,11,12]]",
             "[1, 2, 3, 4, 8, 12, 11, 10, 9, 5, 6, 7]"),
            ("[]", "[]"),
            ("[[1]]", "[1]"),
            ("[[1,2,3]]", "[1, 2, 3]"),
            ("[[1],[2],[3]]", "[1, 2, 3]"),
        ],
    },
    {
        'name': 'word_break',
        # The prompt states the empty-string rule explicitly: the fourth test
        # below expects True for '', which a literal "one or more words"
        # reading would not allow.
        'prompt': (
            'Write a Python function word_break(s: str, words: list[str]) '
            '-> bool that returns True iff s can be segmented into a '
            'sequence of zero or more words from the list (words may be '
            'reused); an empty s returns True. ONLY output a fenced python '
            'block.'
        ),
        'tests': [
            ("'leetcode', ['leet','code']", "True"),
            ("'applepenapple', ['apple','pen']", "True"),
            ("'catsandog', ['cats','dog','sand','and','cat']", "False"),
            ("'', ['a']", "True"),
            ("'aaaaaaa', ['aaa','aaaa']", "True"),
        ],
    },
]
def extract(text):
    """Return the code inside the first fenced block of ``text``.

    The opening fence may carry any info string on its own line (``python``,
    ``py``, ``python3``, or nothing); that whole line is dropped. A bare
    ``python`` tag followed by code on the same line is still recognised.
    If no complete fence is found, the whole text is returned, stripped.

    Args:
        text: Raw model reply. ``None`` or an empty string yields ``''``.

    Returns:
        The extracted source with surrounding whitespace removed.
    """
    if not text:
        return ''
    # First alternative: an info-string line ending in a newline, so tags such
    # as "py" or "python3" do not leak into the code. Second alternative: the
    # bare "python" tag with code following on the same line.
    m = re.search(
        r'```(?:[ \t]*[\w+.-]*[ \t]*\r?\n|python)?\s*(.*?)```',
        text,
        re.DOTALL,
    )
    return m.group(1).strip() if m else text.strip()
def run_tests(code, fn, tests, timeout=6):
    """Run candidate ``code`` in a fresh interpreter and count passing tests.

    A script is assembled from the candidate source followed by one guarded
    call per test case. Each call compares ``repr(fn(args))`` with the
    expected string, and any exception inside a call counts as a failure.
    The script prints a JSON list of booleans as its last output line.

    Args:
        code: Candidate source that should define a callable named ``fn``.
        fn: Name of the function to call.
        tests: Iterable of ``(args, expected)`` string pairs; ``args`` is
            pasted verbatim inside the call parentheses.
        timeout: Seconds allowed for the whole script.

    Returns:
        ``(passed, total)``. If the script times out, exits non-zero, or does
        not print the expected result list, returns ``(0, len(tests))``.
    """
    import sys  # local import: the file's import block does not provide it

    total = len(tests)
    parts = [code, "import json", "r=[]"]
    for args, expected in tests:
        # The bare `except:` in the generated harness is deliberate: candidate
        # code may raise anything, including SystemExit.
        parts.append(
            f"try:\n r.append(repr({fn}({args}))=={expected!r})\n"
            "except: r.append(False)"
        )
    parts.append("print(json.dumps(r))")
    script = "\n".join(parts)

    try:
        proc = subprocess.run(
            # Use the harness's own interpreter; fall back to PATH lookup.
            [sys.executable or 'python3', '-c', script],
            capture_output=True,
            text=True,
            timeout=timeout,
            # Stop candidates that call input() from blocking on our stdin.
            stdin=subprocess.DEVNULL,
        )
    except subprocess.TimeoutExpired:
        # A looping candidate fails its tests; it must not abort the caller.
        return 0, total
    if proc.returncode != 0:
        return 0, total

    # The candidate may print its own output; only the last line is ours.
    lines = proc.stdout.strip().splitlines()
    try:
        results = json.loads(lines[-1])
    except (IndexError, ValueError):
        return 0, total
    if not isinstance(results, list) or len(results) != total:
        return 0, total
    return sum(bool(ok) for ok in results), total
async def ask(model, prompt, level=None):
    """Send ``prompt`` to ``model`` and time the round trip.

    ``client`` and ``types`` are expected to be defined elsewhere in the
    session (presumably the google-genai SDK; confirm against the setup code).

    Args:
        model: Model identifier passed through as ``model``.
        prompt: Prompt text passed through as ``contents``.
        level: Optional thinking level. When truthy it is forwarded as
            ``ThinkingConfig(thinking_level=level)``; otherwise no config is
            sent.

    Returns:
        ``(text, seconds, thought_tokens)``: the reply text (``''`` when the
        response carries none), the elapsed time rounded to 2 decimals, and
        ``usage_metadata.thoughts_token_count`` or 0 when it is missing.
    """
    request = {'model': model, 'contents': prompt}
    if level:
        request['config'] = types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(thinking_level=level))

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments during the request.
    started = time.perf_counter()
    response = await client.aio.models.generate_content(**request)
    elapsed = round(time.perf_counter() - started, 2)

    thoughts = getattr(response.usage_metadata, 'thoughts_token_count', None) or 0
    # Normalise a missing reply to '' so callers can always treat it as str.
    return (response.text or '', elapsed, thoughts)
# One result row is appended here per (task, configuration) pair.
rows = []
# (model, thinking level) pairs under comparison: the lite model with no
# thinking level, then the flash model at each level in ascending order.
configs = [
    ('gemini-3.1-flash-lite', None),
    *(('gemini-3.5-flash', lvl) for lvl in ('minimal', 'low', 'medium', 'high')),
]
for task in TASKS:
rs = await asyncio.gather(*[ask(m, task['prompt'], l) for m, l in configs])
for (m, l), (text, dt, th) in zip(configs, rs):
p, t = run_tests(extract(text), task['name'], task['tests'])
rows.append({'task': task['name'],
'label': 'lite' if 'lite' in m else f'flash/{l}',
'dt_s': dt, 'thoughts': th, 'pass': f'{p}/{t}'})
pd.DataFrame(rows)