Stage 11 — Agents: Solutions
Worked solutions for Stage 11.
Dependencies: an Anthropic API key (the example uses Claude; OpenAI / Gemini work the same with adapted SDKs).
Agent loop in <100 lines
Build a single-agent loop with no framework. Tools: web search, read URL, calculator, finalize.
import os, json, time, requests
import anthropic
# Shared Anthropic client used by every LLM call below.
cli = anthropic.Anthropic()
# ---- Tool implementations ----
def web_search(query: str) -> str:
    """Search the web via Brave and return up to five numbered results.

    Each result is formatted as "[n] title\\nurl\\ndescription" so the model
    can refer to sources by number. Requires BRAVE_KEY in the environment.
    Raises requests.HTTPError on a non-2xx response so the agent loop can
    surface the failure as an actionable tool error.
    """
    # Replace with your search provider (Brave, Tavily, Serper, etc.)
    # This is a sketch; real search would parse JSON results into a clean list.
    r = requests.get(
        "https://api.search.brave.com/res/v1/web/search",
        params={"q": query, "count": 5},
        headers={"X-Subscription-Token": os.environ["BRAVE_KEY"]},
        timeout=10,
    )
    # Fail loudly instead of trying to parse an error page as JSON.
    r.raise_for_status()
    items = r.json().get("web", {}).get("results", [])
    # .get() defaults guard against results missing optional fields.
    return "\n".join(
        f"[{i+1}] {it.get('title', '')}\n{it.get('url', '')}\n{it.get('description', '')}"
        for i, it in enumerate(items[:5])
    )
def read_url(url: str) -> str:
    """Fetch *url* and return its raw body, clipped to 5000 characters.

    Raises requests.HTTPError on a non-2xx status so failures reach the
    model as explicit tool errors instead of truncated HTML error pages.
    """
    # Real version: use trafilatura, readability, or a proper extractor
    r = requests.get(url, timeout=10, headers={"User-Agent": "research-agent/1.0"})
    r.raise_for_status()
    # crude clipping — keeps tool output from blowing up the context window
    return r.text[:5000]
def calculator(expression: str) -> str:
    """Evaluate a restricted math expression and return the result as a string.

    Only the names in *allowed* are visible and builtins are stripped.
    Errors come back as a readable message the model can act on rather
    than raising. Real version: use sympy or an ast-based evaluator,
    never raw eval.
    """
    import math
    allowed = {"abs": abs, "round": round, "pi": math.pi, "e": math.e,
               "sqrt": math.sqrt, "log": math.log, "sin": math.sin, "cos": math.cos}
    # Stripping __builtins__ is not enough on its own: attribute chains like
    # ().__class__.__mro__ can still escape the sandbox, so refuse dunders.
    if "__" in expression:
        return "calculator error: double underscores are not allowed"
    try:
        return str(eval(expression, {"__builtins__": {}}, allowed))
    except Exception as e:
        return f"calculator error: {e}"
# Dispatch table: tool name (as the model emits it) -> implementation.
handlers = dict(
    web_search=web_search,
    read_url=read_url,
    calculator=calculator,
)
# ---- Tool definitions ----
def _tool_spec(name, description, param):
    """Build an Anthropic tool definition with one required string parameter."""
    return {
        "name": name,
        "description": description,
        "input_schema": {
            "type": "object",
            "properties": {param: {"type": "string"}},
            "required": [param],
        },
    }


# Tool definitions advertised to the model; names match the handlers table.
tools = [
    _tool_spec(
        "web_search",
        "Search the web. Use for finding up-to-date information.",
        "query",
    ),
    _tool_spec(
        "read_url",
        "Fetch a URL and return its content (truncated). Use after web_search to read a result.",
        "url",
    ),
    _tool_spec(
        "calculator",
        "Evaluate a math expression. Use for arithmetic — don't compute in your head.",
        "expression",
    ),
]
# ---- The loop ----
def run_agent(task: str, max_steps: int = 12, max_tokens_total: int = 100_000):
    """Drive a single tool-using agent until it answers or a limit trips.

    Returns the model's final text, or a sentinel string when the token
    budget, the step cap, or an unexpected stop reason ends the run early.
    """
    messages = [{"role": "user", "content": task}]
    spent = 0
    for _ in range(max_steps):
        resp = cli.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            tools=tools,
            messages=messages,
        )
        spent += resp.usage.input_tokens + resp.usage.output_tokens
        if spent > max_tokens_total:
            return "BUDGET EXCEEDED"
        if resp.stop_reason == "end_turn":
            # Model is done: stitch its text blocks into the final answer.
            return "".join(b.text for b in resp.content if b.type == "text")
        if resp.stop_reason != "tool_use":
            return f"Unexpected stop_reason: {resp.stop_reason}"
        # Tool turn: echo the assistant content, then answer every request.
        messages.append({"role": "assistant", "content": resp.content})
        results = []
        for block in resp.content:
            if block.type != "tool_use":
                continue
            handler = handlers.get(block.name)
            if handler is None:
                outcome = f"Unknown tool: {block.name}"
            else:
                try:
                    outcome = handler(**block.input)
                except Exception as e:
                    outcome = f"Tool error: {e}"
            results.append({
                "type": "tool_result",
                "tool_use_id": block.id,
                "content": str(outcome)[:5000],  # cap output size
            })
        messages.append({"role": "user", "content": results})
    return "Max steps exceeded"
if __name__ == "__main__":
    # Multi-hop demo: requires a lookup (launch year) plus a follow-up (CEO).
    task = (
        "Who was the CEO of Apple when the iPhone 7 launched? "
        "Show the iPhone 7 launch year and confirm with a source."
    )
    print(run_agent(task))
About 80 lines. Hits the multi-hop test. Notice:
- Budget cap in tokens (prevents runaway).
- Max steps (prevents loops).
- Tool output truncation (prevents context blowup).
- Tool error messages the model can act on.
- Single agent, several tools — no framework.
Add memory: summarize old turns
def maybe_compact(messages, threshold=80_000):
    """If history is huge, summarize the older middle turns.

    Keeps the original user task (first message) and the last 3 turns
    verbatim and replaces everything in between with a short model-written
    summary. Returns *messages* unchanged when the serialized history is
    under *threshold* characters or there is no middle to compact.
    """
    # default=str: assistant turns can hold SDK content-block objects
    # (the loop appends resp.content as-is), which json.dumps cannot
    # serialize natively and would otherwise raise TypeError on.
    total_chars = sum(len(json.dumps(m, default=str)) for m in messages)
    if total_chars < threshold:
        return messages
    # Keep the original user task and the last 3 turns verbatim;
    # summarize everything in between.
    head = messages[:1]
    tail = messages[-3:]
    middle = messages[1:-3]
    if not middle:
        return messages
    summary_prompt = (
        "Summarize the following conversation in 200 words, preserving any facts and "
        "tool results that might be needed later:\n\n" + json.dumps(middle, default=str)
    )
    resp = cli.messages.create(
        model="claude-haiku-4-5",
        max_tokens=512,
        messages=[{"role": "user", "content": summary_prompt}],
    )
    summary = resp.content[0].text
    return head + [{"role": "user", "content": f"[Earlier turns summary]\n{summary}"}] + tail
Call messages = maybe_compact(messages) before each cli.messages.create() in the loop. Now sessions can run dozens of turns without context blowing up.
Detect tool loops
def repeated_call(messages, n=3):
    """Detect if the same tool was called with the same input *n*+ times in a row.

    Walks the history newest-first, collecting (tool name, serialized input)
    pairs from assistant turns, and reports True when the *n* most recent
    calls are identical. Handles both SDK content-block objects and plain
    dict blocks (e.g. histories rebuilt after compaction or loaded from a
    transcript); the original hasattr check silently missed dict blocks.
    """
    last_calls = []
    for m in reversed(messages):
        if m["role"] == "assistant" and isinstance(m["content"], list):
            for b in m["content"]:
                # SDK objects expose .type/.name/.input; dict blocks use keys.
                if isinstance(b, dict):
                    if b.get("type") == "tool_use":
                        last_calls.append((b["name"], json.dumps(b["input"])))
                elif getattr(b, "type", None) == "tool_use":
                    last_calls.append((b.name, json.dumps(b.input)))
                if len(last_calls) >= n:
                    return len(set(last_calls[:n])) == 1
    return False
# In the loop, after getting a tool_use response, nudge the model out of
# a detected loop instead of letting it burn budget:
if repeated_call(messages):
    messages.append({"role": "user", "content": [{"type": "text",
        "text": "You've called the same tool with the same input 3 times. Try a different approach or finalize."}]})
Catches the most common agent failure mode (loops) and gently nudges the model.
Add a simple budget tracker as a class
class Budget:
    """Track token, call, and wall-clock spend; raise when any cap trips.

    Usage: budget = Budget(); then budget.record(resp.usage) and
    budget.check() after each LLM call.
    """

    def __init__(self, max_tokens=100_000, max_calls=20, max_seconds=120):
        self.start = time.time()
        self.tokens = 0
        self.calls = 0
        self.max_tokens = max_tokens
        self.max_calls = max_calls
        self.max_seconds = max_seconds

    def record(self, usage):
        """Fold one LLM response's usage into the running totals."""
        self.calls += 1
        self.tokens += usage.input_tokens + usage.output_tokens

    def check(self):
        """Raise RuntimeError as soon as any budget dimension is exceeded."""
        if self.tokens > self.max_tokens:
            raise RuntimeError(f"Token budget exceeded ({self.tokens})")
        if self.calls > self.max_calls:
            raise RuntimeError(f"Call budget exceeded ({self.calls})")
        if time.time() - self.start > self.max_seconds:
            raise RuntimeError(f"Time budget exceeded ({time.time() - self.start:.1f}s)")
Wrap your agent call: budget = Budget(); ... then budget.record(resp.usage); budget.check() after each LLM call.
Run the same task with a reasoning model
def run_agent_reasoning(task, **kwargs):
    """Run *task* once through a reasoning model with the same toolset.

    Extra keyword arguments override the defaults below (e.g. max_tokens
    or the thinking budget). Previously **kwargs was accepted but silently
    dropped, so caller overrides had no effect.
    """
    params = dict(
        model="claude-opus-4-7",
        max_tokens=4096,
        thinking={"type": "enabled", "budget_tokens": 16000},
        tools=tools,
        messages=[{"role": "user", "content": task}],
    )
    params.update(kwargs)  # honor caller overrides instead of ignoring them
    return cli.messages.create(**params)
For multi-hop research, reasoning often helps. Compare:
- Cost: reasoning thinks for many extra tokens.
- Latency: 30s+ for hard questions.
- Quality: catches its own errors during thinking.
Use reasoning for genuinely hard tasks; cheap models for triage and easy lookups.