Stage 09 — RAG: Solutions
Worked solutions for Stage 9.
Dependencies: chromadb, sentence-transformers, rank_bm25, an LLM API.
Minimal RAG over your notes
Build a basic RAG over 1k+ documents. Retrieve → prompt → answer with citations.
import os, glob, hashlib
import chromadb
from sentence_transformers import SentenceTransformer
import anthropic
# 1. Load documents (here: a folder of .md files)
docs = []
for path in glob.glob("notes/**/*.md", recursive=True):
    # Read as UTF-8 explicitly: the platform-default encoding (e.g. cp1252 on
    # Windows) can corrupt or reject non-ASCII markdown.
    with open(path, encoding="utf-8") as f:
        text = f.read()
    docs.append({
        "id": hashlib.md5(path.encode()).hexdigest(),  # stable id derived from the path
        "path": path,
        "text": text,
    })
# 2. Chunk
def chunk(text, size=500, overlap=50):
    """Split *text* into word-based chunks of up to *size* words.

    Consecutive chunks share *overlap* words so sentences cut at a boundary
    still appear intact in at least one chunk.

    Args:
        text: input string; whitespace-split into words.
        size: maximum words per chunk.
        overlap: words shared between consecutive chunks.

    Returns:
        List of chunk strings (empty list for empty/whitespace-only text).

    Raises:
        ValueError: if overlap >= size — the loop step would be <= 0, which
            previously produced an empty result (or range error) silently.
    """
    if overlap >= size:
        raise ValueError(f"overlap ({overlap}) must be smaller than size ({size})")
    words = text.split()
    chunks = []
    for i in range(0, len(words), size - overlap):
        chunks.append(" ".join(words[i:i + size]))
    return chunks
# 3. Embed + store
# bge-large-en-v1.5: general-purpose English embedding model (1024-dim vectors).
encoder = SentenceTransformer("BAAI/bge-large-en-v1.5")
# Persist vectors under ./db so re-runs don't have to re-embed the corpus.
client = chromadb.PersistentClient("./db")
# Cosine distance matches how bge embeddings are intended to be compared.
col = client.get_or_create_collection("notes", metadata={"hnsw:space": "cosine"})
# Flatten every document into (chunk_id, chunk_text, metadata) triples so the
# whole corpus can be embedded and inserted in single batched calls.
records = [
    (f"{doc['id']}_{idx}", piece, {"path": doc["path"], "chunk": idx})
    for doc in docs
    for idx, piece in enumerate(chunk(doc["text"]))
]
batch_ids = [r[0] for r in records]
batch_texts = [r[1] for r in records]
batch_meta = [r[2] for r in records]
# Batch-encode for speed
embeddings = encoder.encode(batch_texts, batch_size=32, show_progress_bar=True).tolist()
col.add(ids=batch_ids, documents=batch_texts, embeddings=embeddings, metadatas=batch_meta)
# 4. Query
def ask(question, k=5):
    """Answer *question* from the top-*k* retrieved chunks, citing by number.

    Returns a (answer_text, source_labels) tuple, where source_labels are
    human-readable "[n] path (chunk m)" strings matching the citations.
    """
    query_vec = encoder.encode([question]).tolist()
    hits = col.query(query_embeddings=query_vec, n_results=k)

    sources = []
    parts = []
    # Number chunks from 1 so citations like "[2]" line up with the labels.
    for idx, (text, meta) in enumerate(zip(hits["documents"][0], hits["metadatas"][0]), start=1):
        sources.append(f"[{idx}] {meta['path']} (chunk {meta['chunk']})")
        parts.append(f"[{idx}] {text}")
    context = "\n\n".join(parts)

    prompt = f"""Answer using only the context below. If the context doesn't contain
the answer, say "I don't know." Cite sources by number, e.g. "According to [2]."
<context>
{context}
</context>
<question>
{question}
</question>"""

    llm = anthropic.Anthropic()
    response = llm.messages.create(
        model="claude-haiku-4-5",
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text, sources
# Smoke-test the pipeline end to end with a single question.
answer, sources = ask("What's the right embedding model for code?")
print(answer)
print("\n--- Sources ---")
for src in sources:
    print(src)
That’s a working RAG in ~60 lines. It already covers: chunking, embedding, vector storage, retrieval, generation, citation. From here, every improvement is layered on this baseline.
Build a 50-query golden set and measure recall@10
50 representative queries with expected source docs.
# Golden set: each entry pairs a realistic query with the document that a good
# retriever should surface for it.
golden = [
    {"q": "How do I set up pgvector?", "expected_path": "notes/pgvector.md"},
    {"q": "What's the difference between cosine and Euclidean distance?", "expected_path": "notes/embedding-geometry.md"},
    # ... 48 more, ideally derived from real users or your own questions about the corpus
]
def retrieve(query, k=10):
    """Return the source paths of the top-*k* chunks for *query* (dense only)."""
    vec = encoder.encode([query]).tolist()
    hits = col.query(query_embeddings=vec, n_results=k)
    return [meta["path"] for meta in hits["metadatas"][0]]
# Recall@k: how often is the expected doc in top-k?
def recall_at_k(golden, k=10):
    """Fraction of golden queries whose expected doc path is in the top-k results.

    Args:
        golden: list of {"q": ..., "expected_path": ...} dicts.
        k: retrieval depth.

    Returns:
        Hit rate in [0, 1]; 0.0 for an empty golden set (the original raised
        ZeroDivisionError there).
    """
    if not golden:
        return 0.0
    hits = sum(
        1 for item in golden
        if item["expected_path"] in retrieve(item["q"], k=k)
    )
    return hits / len(golden)
# Report retrieval quality at several depths.
for cutoff in (1, 3, 5, 10):
    score = recall_at_k(golden, cutoff)
    print(f"recall@{cutoff} = {score:.2%}")
Aim for recall@10 ≥ 80% before optimizing anything else. If it’s lower, the bottleneck is in retrieval, not generation; chunking, embedding model, or hybrid search will help.
Hybrid search (BM25 + dense)
Combine BM25 keyword search with dense vector search.
from rank_bm25 import BM25Okapi
# Build a BM25 index over the exact same chunks the vector store holds,
# using simple lowercase whitespace tokenization.
corpus_tokens = [chunk_text.lower().split() for chunk_text in batch_texts]
bm25 = BM25Okapi(corpus_tokens)
def hybrid_retrieve(query, k=10, pool=20, rrf_c=60):
    """Hybrid retrieval: fuse dense (vector) and BM25 rankings via RRF.

    Args:
        query: user query string.
        k: number of chunk ids to return.
        pool: candidates taken from each retriever before fusion
            (was hard-coded to 20).
        rrf_c: RRF smoothing constant; 60 is the standard value from the
            original Reciprocal Rank Fusion paper (was hard-coded).

    Returns:
        Top-k chunk ids ordered by fused score, best first.
    """
    # Dense candidates
    q_emb = encoder.encode([query]).tolist()
    dense_results = col.query(query_embeddings=q_emb, n_results=pool)
    dense_ids = dense_results["ids"][0]

    # BM25 candidates (same tokenization as the index: lowercase + split)
    bm25_scores = bm25.get_scores(query.lower().split())
    top_bm25_idx = bm25_scores.argsort()[::-1][:pool]
    bm25_ids = [batch_ids[i] for i in top_bm25_idx]

    # Reciprocal rank fusion: each list contributes 1 / (c + rank) per id.
    rrf = {}
    for ranking in (dense_ids, bm25_ids):
        for rank, id_ in enumerate(ranking, 1):
            rrf[id_] = rrf.get(id_, 0) + 1 / (rrf_c + rank)
    return sorted(rrf, key=rrf.get, reverse=True)[:k]
For queries with rare terms (acronyms, IDs, proper nouns), hybrid search often adds 5–15 percentage points of recall@10 over dense retrieval alone.
Reranking with a cross-encoder
from sentence_transformers import CrossEncoder
# Cross-encoder scores each (query, passage) pair jointly — slower than
# bi-encoder similarity but more accurate, so it runs only on a small candidate set.
reranker = CrossEncoder("BAAI/bge-reranker-v2-m3")
def retrieve_and_rerank(query, k=10, candidate_k=50):
    """Two-stage retrieval: wide hybrid recall, then cross-encoder precision.

    Args:
        query: user query string.
        k: number of chunk ids to return after reranking.
        candidate_k: size of the stage-1 candidate pool.

    Returns:
        Top-k chunk ids ordered by reranker score, best first.
    """
    # Stage 1: retrieve a wider net
    candidates = hybrid_retrieve(query, k=candidate_k)

    # One batched fetch instead of a round-trip per id. The order of col.get
    # results may not match the requested ids, so realign via an id -> doc map.
    fetched = col.get(ids=candidates)
    doc_by_id = dict(zip(fetched["ids"], fetched["documents"]))
    candidate_texts = [doc_by_id[id_] for id_ in candidates]

    # Stage 2: rerank
    pairs = [(query, t) for t in candidate_texts]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(candidates, scores), key=lambda x: -x[1])[:k]
    return [id_ for id_, _ in ranked]
Almost always improves precision. The two-stage architecture (cheap recall → expensive precision) is the production-RAG default.
Faithfulness eval with LLM-as-judge
Sample answers from your RAG; check if every claim is supported by the context.
import anthropic
import json
# One shared client, reused by every judge call below.
cli = anthropic.Anthropic()
def faithfulness(question, answer, context):
    """LLM-as-judge: decide whether *answer* is supported by *context*.

    Returns:
        The judge's verdict dict:
        {"verdict": "supported" | "partial" | "unsupported", "explanation": "..."}.

    Raises:
        ValueError: if the judge's reply contains no JSON object.
        json.JSONDecodeError: if the extracted span is not valid JSON.
    """
    prompt = f"""You are evaluating whether an answer is faithful to its source context.
Question: {question}
Context: {context}
Answer: {answer}
For each substantive claim in the answer, decide if it is FULLY SUPPORTED, PARTIALLY SUPPORTED, or NOT SUPPORTED by the context.
Output JSON: {{"verdict": "supported" | "partial" | "unsupported", "explanation": "..."}}
"""
    resp = cli.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}],
    )
    raw = resp.content[0].text
    # Models often wrap JSON in prose or code fences; json.loads on the raw
    # reply would crash, so extract the outermost {...} span first.
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end == -1:
        raise ValueError(f"judge returned no JSON object: {raw!r}")
    return json.loads(raw[start:end + 1])
# Run on your golden set
verdicts = []
for item in golden[:20]:
    answer, _ = ask(item["q"])
    # BUG FIX: retrieve() returns file *paths*, so the judge was handed a list
    # of paths as "context" and could never verify claims. Fetch the actual
    # chunk texts the generator was grounded on instead.
    q_emb = encoder.encode([item["q"]]).tolist()
    hits = col.query(query_embeddings=q_emb, n_results=5)
    context = "\n\n".join(hits["documents"][0])
    verdicts.append(faithfulness(item["q"], answer, context))

supported = sum(1 for v in verdicts if v["verdict"] == "supported")
print(f"faithfulness: {supported}/{len(verdicts)} = {supported/len(verdicts):.0%}")
Aim for ≥90% on factual queries. If you’re seeing “unsupported” verdicts, your retrieval may be off (model is filling gaps with prior knowledge) or your prompt isn’t strong enough (model isn’t grounding).