# Hybrid search with reciprocal rank fusion
class HybridRetriever:
    """Hybrid retriever that fuses BM25 and semantic rankings with RRF.

    Both underlying retrievers are built over the same document list; their
    per-query rankings are merged by :meth:`reciprocal_rank_fusion`.
    """

    # Minimum number of candidates requested from each retriever before
    # fusion; `search` asks for more when the caller's top_k exceeds it.
    _MIN_CANDIDATES = 20

    def __init__(self, documents: list[str]):
        """Initialize both BM25 and semantic retrievers.

        Args:
            documents: Corpus handed unchanged to both retrievers and kept
                on the instance as ``self.documents``.
        """
        self.bm25 = BM25Retriever(documents)
        self.semantic = SemanticRetriever(documents)
        self.documents = documents

    def reciprocal_rank_fusion(
        self,
        bm25_results: list[dict],
        semantic_results: list[dict],
        k: int = 60  # RRF constant
    ) -> list[dict]:
        """Fuse two ranked result lists using Reciprocal Rank Fusion.

        RRF score = sum(1 / (k + rank)) with one term per result list in
        which the document appears.

        Args:
            bm25_results: Ranked results; each entry is a dict with a
                hashable ``'document'`` and a numeric ``'rank'``.
            semantic_results: Ranked results with the same entry shape.
            k: Smoothing constant added to every rank before inversion.

        Returns:
            One dict per distinct document with keys ``"document"``,
            ``"score"`` (the fused RRF score) and ``"rank"`` (1-based
            position in the fused ordering), sorted by descending score.
            Documents with equal scores keep first-seen order (BM25 list
            first), because the sort is stable.

        Raises:
            KeyError: If an entry lacks ``'document'`` or ``'rank'``.
        """
        scores: dict = {}
        for results in (bm25_results, semantic_results):
            # A document contributes at most once per list: if a retriever
            # returns the same document more than once (e.g. duplicate
            # corpus entries), only its best (lowest) rank counts.
            best_rank: dict = {}
            for result in results:
                doc = result['document']
                rank = result['rank']
                if doc not in best_rank or rank < best_rank[doc]:
                    best_rank[doc] = rank
            for doc, rank in best_rank.items():
                scores[doc] = scores.get(doc, 0) + 1 / (k + rank)

        # Sort by combined score, highest first.
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return [
            {"document": doc, "score": score, "rank": position}
            for position, (doc, score) in enumerate(ranked, start=1)
        ]

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        """Run both retrievers for ``query`` and return the fused top results.

        Args:
            query: Query string passed to both retrievers.
            top_k: Maximum number of fused results to return.

        Returns:
            At most ``top_k`` fused result dicts (see
            :meth:`reciprocal_rank_fusion`); an empty list when ``top_k``
            is zero or negative.
        """
        # A negative top_k would otherwise slice from the end of the list
        # and return "all but the last n" results.
        if top_k <= 0:
            return []

        # Fetch at least as many candidates as the caller wants back, so a
        # large top_k is not capped by the default candidate pool.
        candidate_k = max(top_k, self._MIN_CANDIDATES)
        bm25_results = self.bm25.search(query, top_k=candidate_k)
        semantic_results = self.semantic.search(query, top_k=candidate_k)

        # Fuse and return top-k
        fused = self.reciprocal_rank_fusion(bm25_results, semantic_results)
        return fused[:top_k]
# Usage example: three queries that illustrate where hybrid retrieval helps.
documents = [
    "Python 3.11 introduced exception groups for better error handling",
    "Error handling in Python uses try-except blocks",
    "JavaScript has try-catch for exception management",
]
hybrid = HybridRetriever(documents)

# Query 1 - exact-term query, where the BM25 side should dominate.
# Expected: Doc 1 on top, since it contains the literal version string.
results = hybrid.search(query="Python 3.11 features")

# Query 2 - conceptual question, where the semantic side should dominate.
# Expected: Doc 2 on top, as the closest match in meaning to error handling.
results = hybrid.search(query="How do I handle errors in Python?")

# Query 3 - both signals point the same way, so fusion reinforces them.
# Expected: Docs 1 and 2 both near the top.
results = hybrid.search(query="Python exception handling")