From f77fd229d630e07d539aedb5da372ab8ffff4e0b Mon Sep 17 00:00:00 2001 From: m3taversal Date: Mon, 23 Mar 2026 17:44:06 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20stop=20word=20filtering=20in=20entity=20?= =?UTF-8?q?scoring=20=E2=80=94=20common=20words=20polluted=20rankings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'the', 'full', 'text', 'proposal' etc. were matching irrelevant entities. Robin Hanson record ranked #2 behind Drift because Drift matched 'the' and 'proposal' in its name. Now only meaningful tokens (>=3 chars, not stop words) contribute to entity scoring. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70> --- telegram/kb_retrieval.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/telegram/kb_retrieval.py b/telegram/kb_retrieval.py index ca4921a..a1e9241 100644 --- a/telegram/kb_retrieval.py +++ b/telegram/kb_retrieval.py @@ -430,6 +430,17 @@ def retrieve_context(query: str, repo_dir: str, index: KBIndex | None = None, # ─── Scoring ────────────────────────────────────────────────────────── +_STOP_WORDS = frozenset({ + "the", "for", "and", "but", "not", "you", "can", "has", "are", "was", + "its", "all", "had", "her", "one", "our", "out", "new", "now", "old", + "see", "way", "may", "say", "she", "two", "how", "did", "get", "put", + "give", "me", "ok", "full", "text", "what", "about", "tell", "this", + "that", "with", "from", "have", "more", "some", "than", "them", "then", + "into", "also", "just", "your", "been", "here", "will", "does", "know", + "please", "think", +}) + + def _score_entity(query_lower: str, query_tokens: list[str], entity: dict) -> float: """Score an entity against a query. Higher = more relevant.""" name_lower = entity["name"].lower() @@ -437,9 +448,10 @@ def _score_entity(query_lower: str, query_tokens: list[str], entity: dict) -> fl aliases = entity.get("aliases", []) score = 0.0 - for token in query_tokens: - if len(token) < 2: - continue + # Filter out stop words — only score meaningful tokens + meaningful_tokens = [t for t in query_tokens if t not in _STOP_WORDS and len(t) >= 3] + + for token in meaningful_tokens: # Name match (highest signal) if token in name_lower: score += 3.0 @@ -451,8 +463,8 @@ def _score_entity(query_lower: str, query_tokens: list[str], entity: dict) -> fl score += 0.5 # Boost multi-word name matches (e.g. "robin hanson" in entity name) - if len(query_tokens) >= 2: - bigrams = [f"{query_tokens[i]} {query_tokens[i+1]}" for i in range(len(query_tokens) - 1)] + if len(meaningful_tokens) >= 2: + bigrams = [f"{meaningful_tokens[i]} {meaningful_tokens[i+1]}" for i in range(len(meaningful_tokens) - 1)] for bg in bigrams: if bg in name_lower: score += 5.0