fix: stop word filtering in entity scoring — common words polluted rankings
'the', 'full', 'text', 'proposal', etc. were matching irrelevant entities. The Robin Hanson record ranked #2 behind Drift because Drift matched 'the' and 'proposal' in its name. Now only meaningful tokens (>=3 chars, not stop words) contribute to entity scoring. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
parent
089b4609d5
commit
f77fd229d6
1 changed file with 17 additions and 5 deletions
|
|
@@ -430,6 +430,17 @@ def retrieve_context(query: str, repo_dir: str, index: KBIndex | None = None,
|
||||||
# ─── Scoring ──────────────────────────────────────────────────────────
|
# ─── Scoring ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
_STOP_WORDS = frozenset({
|
||||||
|
"the", "for", "and", "but", "not", "you", "can", "has", "are", "was",
|
||||||
|
"its", "all", "had", "her", "one", "our", "out", "new", "now", "old",
|
||||||
|
"see", "way", "may", "say", "she", "two", "how", "did", "get", "put",
|
||||||
|
"give", "me", "ok", "full", "text", "what", "about", "tell", "this",
|
||||||
|
"that", "with", "from", "have", "more", "some", "than", "them", "then",
|
||||||
|
"into", "also", "just", "your", "been", "here", "will", "does", "know",
|
||||||
|
"please", "think",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
def _score_entity(query_lower: str, query_tokens: list[str], entity: dict) -> float:
|
def _score_entity(query_lower: str, query_tokens: list[str], entity: dict) -> float:
|
||||||
"""Score an entity against a query. Higher = more relevant."""
|
"""Score an entity against a query. Higher = more relevant."""
|
||||||
name_lower = entity["name"].lower()
|
name_lower = entity["name"].lower()
|
||||||
|
|
@@ -437,9 +448,10 @@ def _score_entity(query_lower: str, query_tokens: list[str], entity: dict) -> fl
|
||||||
aliases = entity.get("aliases", [])
|
aliases = entity.get("aliases", [])
|
||||||
score = 0.0
|
score = 0.0
|
||||||
|
|
||||||
for token in query_tokens:
|
# Filter out stop words — only score meaningful tokens
|
||||||
if len(token) < 2:
|
meaningful_tokens = [t for t in query_tokens if t not in _STOP_WORDS and len(t) >= 3]
|
||||||
continue
|
|
||||||
|
for token in meaningful_tokens:
|
||||||
# Name match (highest signal)
|
# Name match (highest signal)
|
||||||
if token in name_lower:
|
if token in name_lower:
|
||||||
score += 3.0
|
score += 3.0
|
||||||
|
|
@@ -451,8 +463,8 @@ def _score_entity(query_lower: str, query_tokens: list[str], entity: dict) -> fl
|
||||||
score += 0.5
|
score += 0.5
|
||||||
|
|
||||||
# Boost multi-word name matches (e.g. "robin hanson" in entity name)
|
# Boost multi-word name matches (e.g. "robin hanson" in entity name)
|
||||||
if len(query_tokens) >= 2:
|
if len(meaningful_tokens) >= 2:
|
||||||
bigrams = [f"{query_tokens[i]} {query_tokens[i+1]}" for i in range(len(query_tokens) - 1)]
|
bigrams = [f"{meaningful_tokens[i]} {meaningful_tokens[i+1]}" for i in range(len(meaningful_tokens) - 1)]
|
||||||
for bg in bigrams:
|
for bg in bigrams:
|
||||||
if bg in name_lower:
|
if bg in name_lower:
|
||||||
score += 5.0
|
score += 5.0
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue