Stage 11 — Agents: Solutions

Worked solutions for Stage 11.

Dependencies: an Anthropic API key, plus a Brave Search API key (BRAVE_KEY) for the search tool (the example uses Claude; OpenAI / Gemini work the same with adapted SDKs).

Agent loop in <100 lines

Build a single-agent loop with no framework. Tools: web search, read URL, calculator, finalize.

import os, json, time, requests
import anthropic

cli = anthropic.Anthropic()

# ---- Tool implementations ----

def web_search(query: str) -> str:
    """Search the web via the Brave Search API and return up to 5 results.

    Each result is formatted as "[rank] title / url / description" so the
    model can cite results by number. Requires BRAVE_KEY in the environment.
    Replace with your search provider of choice (Tavily, Serper, etc.).
    """
    r = requests.get(
        "https://api.search.brave.com/res/v1/web/search",
        params={"q": query, "count": 5},
        headers={"X-Subscription-Token": os.environ["BRAVE_KEY"]},
        timeout=10,
    )
    # Surface HTTP failures as exceptions; the agent loop converts them
    # into "Tool error: ..." messages the model can react to.
    r.raise_for_status()
    items = r.json().get("web", {}).get("results", [])
    # .get() with defaults: search APIs occasionally omit fields like
    # description, and a KeyError here would abort the whole tool call.
    return "\n".join(
        f"[{i + 1}] {it.get('title', '')}\n{it.get('url', '')}\n{it.get('description', '')}"
        for i, it in enumerate(items[:5])
    )

def read_url(url: str) -> str:
    """Fetch *url* and return the first 5000 characters of the body.

    A crude extractor — a real version should use trafilatura, readability,
    or a proper content extractor to strip boilerplate HTML.
    """
    r = requests.get(url, timeout=10, headers={"User-Agent": "research-agent/1.0"})
    # Fail loudly on 4xx/5xx so the loop reports "Tool error: ..." to the
    # model, instead of feeding it an error page as if it were content.
    r.raise_for_status()
    # crude clipping to keep tool output from blowing up the context
    return r.text[:5000]

def calculator(expression: str) -> str:
    """Evaluate a math expression in a restricted eval namespace.

    Returns the result as a string, or a "calculator error: ..." message
    the model can recover from. Real version: use sympy or an ast-based
    evaluator — a restricted eval is not a true sandbox.
    """
    import math

    # Emptying __builtins__ alone is escapable via dunder attribute access
    # (e.g. ().__class__.__bases__[0].__subclasses__()), so reject dunders.
    if "__" in expression:
        return "calculator error: disallowed token '__' in expression"
    allowed = {"abs": abs, "round": round, "pi": math.pi, "e": math.e,
               "sqrt": math.sqrt, "log": math.log, "sin": math.sin, "cos": math.cos}
    try:
        return str(eval(expression, {"__builtins__": {}}, allowed))
    except Exception as e:
        return f"calculator error: {e}"

# Dispatch table: maps the tool name the model emits to its implementation.
# Function names deliberately match the tool names declared below.
handlers = {fn.__name__: fn for fn in (web_search, read_url, calculator)}

# ---- Tool definitions ----

def _tool_spec(name: str, description: str, arg: str) -> dict:
    # All three tools take exactly one required string argument, so the
    # JSON schema can be stamped out from a single template.
    return {
        "name": name,
        "description": description,
        "input_schema": {
            "type": "object",
            "properties": {arg: {"type": "string"}},
            "required": [arg],
        },
    }

tools = [
    _tool_spec(
        "web_search",
        "Search the web. Use for finding up-to-date information.",
        "query",
    ),
    _tool_spec(
        "read_url",
        "Fetch a URL and return its content (truncated). Use after web_search to read a result.",
        "url",
    ),
    _tool_spec(
        "calculator",
        "Evaluate a math expression. Use for arithmetic — don't compute in your head.",
        "expression",
    ),
]

# ---- The loop ----

def run_agent(task: str, max_steps: int = 12, max_tokens_total: int = 100_000):
    """Run a single-agent tool loop until the model finalizes an answer.

    Args:
        task: the user task, sent as the opening message.
        max_steps: hard cap on LLM calls (prevents infinite tool loops).
        max_tokens_total: hard cap on input+output tokens (prevents runaway cost).

    Returns the model's final text, or a sentinel string when a budget is
    exhausted or an unexpected stop reason appears.
    """
    messages = [{"role": "user", "content": task}]
    used = 0

    for step in range(max_steps):
        resp = cli.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            tools=tools,
            messages=messages,
        )
        used += resp.usage.input_tokens + resp.usage.output_tokens
        if used > max_tokens_total:
            return "BUDGET EXCEEDED"

        # Model finished — or hit its per-call output cap. Previously
        # "max_tokens" fell through to the "Unexpected stop_reason" branch
        # and discarded the partial answer; return whatever text we got.
        if resp.stop_reason in ("end_turn", "max_tokens"):
            return "".join(b.text for b in resp.content if b.type == "text")

        if resp.stop_reason == "tool_use":
            # Echo the assistant turn back so tool_use ids pair with results.
            messages.append({"role": "assistant", "content": resp.content})

            tool_results = []
            for block in resp.content:
                if block.type != "tool_use":
                    continue
                handler = handlers.get(block.name)
                if handler is None:
                    result = f"Unknown tool: {block.name}"
                else:
                    try:
                        result = handler(**block.input)
                    except Exception as e:
                        # Feed the error back so the model can retry or adapt.
                        result = f"Tool error: {e}"
                tool_results.append({
                    "type": "tool_result",
                    "tool_use_id": block.id,
                    "content": str(result)[:5000],   # cap output size
                })
            messages.append({"role": "user", "content": tool_results})
            continue

        return f"Unexpected stop_reason: {resp.stop_reason}"

    return "Max steps exceeded"


if __name__ == "__main__":
    answer = run_agent(
        "Who was the CEO of Apple when the iPhone 7 launched? "
        "Show the iPhone 7 launch year and confirm with a source."
    )
    print(answer)

About 80 lines. Hits the multi-hop test. Notice:

  • Budget cap in tokens (prevents runaway).
  • Max steps (prevents loops).
  • Tool output truncation (prevents context blowup).
  • Tool error messages the model can act on.
  • Single agent, several tools — no framework.

Add memory: summarize old turns

def maybe_compact(messages, threshold=80_000):
    """If history is huge, summarize the older middle turns.

    Keeps the original task and the last 3 turns verbatim and replaces
    everything in between with a model-written summary. Returns the input
    list unchanged when it is under the threshold or too short to compact.
    """
    # default=str: assistant turns hold SDK content-block objects (the loop
    # appends resp.content directly), which json.dumps cannot serialize
    # natively — without it this line raises TypeError on real histories.
    total_chars = sum(len(json.dumps(m, default=str)) for m in messages)
    if total_chars < threshold:
        return messages

    # Keep the original user task and the last 3 turns verbatim;
    # summarize everything in between.
    head = messages[:1]
    tail = messages[-3:]
    middle = messages[1:-3]

    if not middle:
        return messages

    summary_prompt = (
        "Summarize the following conversation in 200 words, preserving any facts and "
        "tool results that might be needed later:\n\n" + json.dumps(middle, default=str)
    )
    resp = cli.messages.create(
        model="claude-haiku-4-5",
        max_tokens=512,
        messages=[{"role": "user", "content": summary_prompt}],
    )
    summary = resp.content[0].text

    return head + [{"role": "user", "content": f"[Earlier turns summary]\n{summary}"}] + tail

Call messages = maybe_compact(messages) before each cli.messages.create() in the loop. Now sessions can run dozens of turns without context blowing up.

Detect tool loops

def repeated_call(messages, n=3):
    """Detect if the same tool was called with the same input n times in a row.

    Handles both SDK content-block objects (as appended by the loop) and
    plain dicts (e.g. after serialization/compaction). Scans from the most
    recent call backwards; returns False if fewer than n calls exist.
    """
    last_calls = []
    for m in reversed(messages):
        if m["role"] != "assistant" or not isinstance(m["content"], list):
            continue
        # Reverse block order within the turn so last_calls is strictly
        # most-recent-first (messages alone are reversed, blocks were not).
        for b in reversed(m["content"]):
            if isinstance(b, dict):
                btype, bname, binput = b.get("type"), b.get("name"), b.get("input")
            else:
                btype = getattr(b, "type", None)
                bname = getattr(b, "name", None)
                binput = getattr(b, "input", None)
            if btype != "tool_use":
                continue
            # sort_keys: identical inputs must hash identically regardless
            # of dict key order.
            last_calls.append((bname, json.dumps(binput, sort_keys=True)))
            if len(last_calls) >= n:
                return len(set(last_calls[:n])) == 1
    return False

# In the loop, after getting a tool_use response (and before executing the
# tools), nudge the model out of a detected loop with a plain user message:
if repeated_call(messages):
    messages.append({"role": "user", "content": [{"type": "text",
        "text": "You've called the same tool with the same input 3 times. Try a different approach or finalize."}]})

Catches the most common agent failure mode (loops) and gently nudges the model.

Add a simple budget tracker as a class

class Budget:
    """Tracks token, LLM-call, and wall-clock budgets for one agent run.

    Usage: record(resp.usage) then check() after every LLM call; check()
    raises RuntimeError the moment any limit is crossed.
    """

    def __init__(self, max_tokens=100_000, max_calls=20, max_seconds=120):
        # Limits first, then mutable counters; clock starts at construction.
        self.max_tokens = max_tokens
        self.max_calls = max_calls
        self.max_seconds = max_seconds
        self.start = time.time()
        self.tokens = 0
        self.calls = 0

    def record(self, usage):
        """Fold one response's usage (input + output tokens) into the totals."""
        self.calls += 1
        self.tokens += usage.input_tokens + usage.output_tokens

    def check(self):
        """Raise RuntimeError if any budget (tokens, calls, time) is exceeded."""
        if self.tokens > self.max_tokens:
            raise RuntimeError(f"Token budget exceeded ({self.tokens})")
        if self.calls > self.max_calls:
            raise RuntimeError(f"Call budget exceeded ({self.calls})")
        if time.time() - self.start > self.max_seconds:
            raise RuntimeError(f"Time budget exceeded ({time.time() - self.start:.1f}s)")

Wrap your agent call: budget = Budget(); ... then budget.record(resp.usage); budget.check() after each LLM call.

Run the same task with a reasoning model

def run_agent_reasoning(task, **kwargs):
    """Run one reasoning-model call for *task* with extended thinking enabled.

    Previously **kwargs was accepted but silently dropped; now any keyword
    (e.g. max_tokens=8192, a different model, extra tools) overrides the
    defaults below before the API call.
    """
    params = dict(
        model="claude-opus-4-7",
        max_tokens=4096,
        thinking={"type": "enabled", "budget_tokens": 16000},
        tools=tools,
        messages=[{"role": "user", "content": task}],
    )
    params.update(kwargs)  # caller-supplied overrides win
    return cli.messages.create(**params)

For multi-hop research, reasoning often helps. Compare:

  • Cost: reasoning thinks for many extra tokens.
  • Latency: 30s+ for hard questions.
  • Quality: catches its own errors during thinking.

Use reasoning for genuinely hard tasks; cheap models for triage and easy lookups.

See also