Pulled from /opt/teleo-eval/telegram/ on VPS. Includes: - bot.py (92K), kb_retrieval.py, kb_tools.py (agentic retrieval) - retrieval.py (RRF merge, query decomposition, entity traversal) - response.py (system prompt builder, response parser) - agent_config.py, agent_runner.py (multi-agent template unit support) - approval_stages.py, approvals.py, digest.py (approval workflow) - eval_checks.py, eval.py (response quality checks) - output_gate.py, x_publisher.py, x_client.py, x_search.py (X pipeline) - market_data.py, worktree_lock.py (utilities) - rio.yaml, theseus.yaml (agent configs) These files were deployed to VPS but never committed to the repo. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
747 lines
28 KiB
Python
747 lines
28 KiB
Python
#!/usr/bin/env python3
|
||
"""KB Retrieval for Telegram bot — multi-layer search across the Teleo knowledge base.
|
||
|
||
Architecture (Ganymede-reviewed):
|
||
Layer 1: Entity resolution — query tokens → entity name/aliases/tags → entity file
|
||
Layer 2: Claim search — substring + keyword matching on titles AND descriptions
|
||
Layer 3: Agent context — positions, beliefs referencing matched entities/claims
|
||
|
||
Entry point: retrieve_context(query, repo_dir) → KBContext
|
||
|
||
Epimetheus owns this module.
|
||
"""
|
||
|
||
import logging
|
||
import re
|
||
import time
|
||
from dataclasses import dataclass, field
|
||
from pathlib import Path
|
||
|
||
import yaml
|
||
|
||
logger = logging.getLogger("kb-retrieval")
|
||
|
||
# ─── Types ────────────────────────────────────────────────────────────
|
||
|
||
|
||
@dataclass
class EntityMatch:
    """A matched entity with its profile."""
    name: str                   # canonical name from frontmatter (falls back to file stem)
    path: str                   # path of the source markdown file
    entity_type: str            # frontmatter entity_type, "unknown" when absent
    domain: str                 # frontmatter domain, "unknown" when absent
    overview: str  # first ~500 chars of body
    tags: list[str]             # frontmatter tags
    related_claims: list[str]  # wiki-link titles from body
|
||
|
||
|
||
@dataclass
class ClaimMatch:
    """A matched claim."""
    title: str        # file stem with dashes replaced by spaces
    path: str         # path of the claim markdown file
    domain: str       # frontmatter domain, or inferred from path
    confidence: str   # frontmatter confidence, "unknown" when absent
    description: str  # frontmatter description (sanitized before prompt injection)
    score: float  # relevance score
|
||
|
||
|
||
@dataclass
class PositionMatch:
    """An agent position on a topic."""
    agent: str    # agent directory name under agents/
    title: str    # frontmatter title, or file stem
    content: str  # first ~500 chars
|
||
|
||
|
||
@dataclass
class KBContext:
    """Full KB context for a query — passed to the LLM prompt."""
    entities: list[EntityMatch] = field(default_factory=list)     # Layer 1: entity resolution
    claims: list[ClaimMatch] = field(default_factory=list)        # Layer 2: claim search
    positions: list[PositionMatch] = field(default_factory=list)  # Layer 3: agent positions
    belief_excerpts: list[str] = field(default_factory=list)      # Layer 3: "**agent**: excerpt" strings
    stats: dict = field(default_factory=dict)                     # index totals + match counts
|
||
|
||
|
||
# ─── Index ────────────────────────────────────────────────────────────
|
||
|
||
|
||
class KBIndex:
    """In-memory index of entities, claims, and agent state.

    The index is rebuilt wholesale from the filesystem by ensure_fresh() once
    it is older than its max age (default 5 min). There is no per-file mtime
    tracking, so KB edits become visible on the next scheduled rebuild.
    """

    def __init__(self, repo_dir: str):
        self.repo_dir = Path(repo_dir)
        # [{name, path, type, domain, tags, handles, aliases, overview, related_claims}]
        self._entities: list[dict] = []
        # [{title, path, domain, confidence, description}]
        self._claims: list[dict] = []
        # [{agent, title, path, content}]
        self._positions: list[dict] = []
        # [{agent, path, content}]
        self._beliefs: list[dict] = []
        # lowercase alias → indices into _entities
        self._entity_alias_map: dict[str, list[int]] = {}
        self._last_build: float = 0

    def ensure_fresh(self, max_age_seconds: int = 300):
        """Rebuild the index if the last build is older than max_age_seconds (default 5 min)."""
        now = time.time()
        if now - self._last_build > max_age_seconds:
            self._build()

    def _build(self):
        """Rebuild all indexes from the filesystem."""
        logger.info("Rebuilding KB index from %s", self.repo_dir)
        start = time.time()

        self._entities = []
        self._claims = []
        self._positions = []
        self._beliefs = []
        self._entity_alias_map = {}

        self._index_entities()
        self._index_claims()
        self._index_agent_state()
        self._last_build = time.time()

        logger.info("KB index built in %.1fs: %d entities, %d claims, %d positions",
                    time.time() - start, len(self._entities), len(self._claims), len(self._positions))

    def _index_entities(self):
        """Scan entities/ and decisions/ for entity and decision files."""
        entity_dirs = [
            self.repo_dir / "entities",
            self.repo_dir / "decisions",
        ]
        for entities_dir in entity_dirs:
            if not entities_dir.exists():
                continue
            for md_file in entities_dir.rglob("*.md"):
                self._index_single_entity(md_file)

    def _index_single_entity(self, md_file: Path):
        """Index a single entity or decision file.

        Per-file failures are logged and skipped so one malformed file cannot
        abort the whole rebuild.
        """
        try:
            fm, body = _parse_frontmatter(md_file)
            if not fm or fm.get("type") not in ("entity", "decision"):
                return

            # `or` fallback (not a .get default) so an explicit `name:` with a
            # null/empty value still falls back to the file stem instead of
            # crashing on .lower() below and skipping the whole file.
            name = str(fm.get("name") or md_file.stem)
            handles = fm.get("handles", []) or []
            tags = fm.get("tags", []) or []
            entity_type = fm.get("entity_type", "unknown")
            domain = fm.get("domain", "unknown")

            # For decision records, also index summary and proposer as searchable text
            summary = fm.get("summary", "")
            proposer = fm.get("proposer", "")

            # Build aliases from multiple sources
            aliases = set()
            aliases.add(name.lower())
            aliases.add(md_file.stem.lower())  # slugified name
            for h in handles:
                aliases.add(h.lower().lstrip("@"))
            for t in tags:
                aliases.add(t.lower())
            # Add proposer name as alias for decision records
            if proposer:
                aliases.add(proposer.lower())
            # Add parent_entity as alias (Ganymede: MetaDAO queries should surface its decisions)
            parent = fm.get("parent_entity", "")
            if parent:
                parent_slug = parent.strip("[]").lower()
                aliases.add(parent_slug)

            # Mine body for ticker mentions ($XXXX and standalone ALL-CAPS tokens)
            dollar_tickers = re.findall(r"\$([A-Z]{2,10})", body[:2000])
            for ticker in dollar_tickers:
                aliases.add(ticker.lower())
                aliases.add(f"${ticker.lower()}")
            # Standalone all-caps tokens (likely tickers: OMFG, META, SOL)
            caps_tokens = re.findall(r"\b([A-Z]{2,10})\b", body[:2000])
            for token in caps_tokens:
                # Filter common English words that happen to be short caps
                if token not in ("THE", "AND", "FOR", "NOT", "BUT", "HAS", "ARE", "WAS",
                                 "ITS", "ALL", "CAN", "HAD", "HER", "ONE", "OUR", "OUT",
                                 "NEW", "NOW", "OLD", "SEE", "WAY", "MAY", "SAY", "SHE",
                                 "TWO", "HOW", "BOY", "DID", "GET", "PUT", "KEY", "TVL",
                                 "AMM", "CEO", "SDK", "API", "ICO", "APY", "FAQ", "IPO"):
                    aliases.add(token.lower())
                    aliases.add(f"${token.lower()}")

            # Also add aliases field if it exists (future schema)
            for a in (fm.get("aliases", []) or []):
                aliases.add(a.lower())

            # Extract wiki-linked claim references from body
            related_claims = re.findall(r"\[\[([^\]]+)\]\]", body)

            # Body excerpt — decisions get full body, entities get 500 chars
            ft = fm.get("type")
            if ft == "decision":
                # Full body for decision records — proposals can be 6K+
                overview = body[:8000] if body else (summary or "")
            elif summary:
                overview = f"{summary} "
                body_lines = [l for l in body.split("\n") if l.strip() and not l.startswith("#")]
                remaining = 500 - len(overview)
                if remaining > 0:
                    overview += " ".join(body_lines[:10])[:remaining]
            else:
                body_lines = [l for l in body.split("\n") if l.strip() and not l.startswith("#")]
                overview = " ".join(body_lines[:10])[:500]

            idx = len(self._entities)
            self._entities.append({
                "name": name,
                "path": str(md_file),
                "type": entity_type,
                "domain": domain,
                "tags": tags,
                "handles": handles,
                "aliases": list(aliases),
                "overview": overview,
                "related_claims": related_claims,
            })

            # Register all aliases in lookup map
            for alias in aliases:
                self._entity_alias_map.setdefault(alias, []).append(idx)

        except Exception as e:
            logger.warning("Failed to index entity %s: %s", md_file, e)

    def _index_claims(self):
        """Scan domains/, core/, and foundations/ for claim files."""
        claim_dirs = [
            self.repo_dir / "domains",
            self.repo_dir / "core",
            self.repo_dir / "foundations",
        ]
        for claim_dir in claim_dirs:
            if not claim_dir.exists():
                continue
            for md_file in claim_dir.rglob("*.md"):
                # Skip _map.md and other non-claim files
                if md_file.name.startswith("_"):
                    continue
                try:
                    fm, body = _parse_frontmatter(md_file)
                    if not fm:
                        # Many claims lack explicit type — index them anyway
                        title = md_file.stem.replace("-", " ")
                        self._claims.append({
                            "title": title,
                            "path": str(md_file),
                            "domain": _domain_from_path(md_file, self.repo_dir),
                            "confidence": "unknown",
                            "description": "",
                        })
                        continue

                    # Skip non-claim types if type is explicit
                    ft = fm.get("type")
                    if ft and ft not in ("claim", None):
                        continue

                    title = md_file.stem.replace("-", " ")
                    self._claims.append({
                        "title": title,
                        "path": str(md_file),
                        "domain": fm.get("domain", _domain_from_path(md_file, self.repo_dir)),
                        "confidence": fm.get("confidence", "unknown"),
                        "description": fm.get("description", ""),
                    })
                except Exception as e:
                    logger.warning("Failed to index claim %s: %s", md_file, e)

    def _index_agent_state(self):
        """Scan agents/ for positions and beliefs."""
        agents_dir = self.repo_dir / "agents"
        if not agents_dir.exists():
            return
        for agent_dir in agents_dir.iterdir():
            if not agent_dir.is_dir():
                continue
            agent_name = agent_dir.name

            # Index positions
            positions_dir = agent_dir / "positions"
            if positions_dir.exists():
                for md_file in positions_dir.glob("*.md"):
                    try:
                        fm, body = _parse_frontmatter(md_file)
                        title = fm.get("title", md_file.stem.replace("-", " ")) if fm else md_file.stem.replace("-", " ")
                        content = body[:500] if body else ""
                        self._positions.append({
                            "agent": agent_name,
                            "title": title,
                            "path": str(md_file),
                            "content": content,
                        })
                    except Exception as e:
                        logger.warning("Failed to index position %s: %s", md_file, e)

            # Index beliefs (just the file, we'll excerpt on demand)
            beliefs_file = agent_dir / "beliefs.md"
            if beliefs_file.exists():
                try:
                    # errors="replace" for parity with _parse_frontmatter — one
                    # badly-encoded beliefs file should not be dropped outright.
                    content = beliefs_file.read_text(errors="replace")[:3000]
                    self._beliefs.append({
                        "agent": agent_name,
                        "path": str(beliefs_file),
                        "content": content,
                    })
                except Exception as e:
                    logger.warning("Failed to index beliefs %s: %s", beliefs_file, e)
|
||
|
||
|
||
# ─── Retrieval ────────────────────────────────────────────────────────
|
||
|
||
|
||
def retrieve_context(query: str, repo_dir: str, index: KBIndex | None = None,
                     max_claims: int = 8, max_entities: int = 5,
                     max_positions: int = 3,
                     kb_scope: list[str] | None = None) -> KBContext:
    """Main entry point: retrieve full KB context for a query.

    Three layers:
    1. Entity resolution — match query tokens to entities, scored by relevance
    2. Claim search — substring + keyword matching on titles and descriptions
    3. Agent context — positions and beliefs referencing matched entities/claims

    Args:
        query: Raw user query text.
        repo_dir: KB repo root; used to build a fresh index and to resolve kb_scope.
        index: Optional pre-built KBIndex; a fresh one is built when None.
        max_claims / max_entities / max_positions: per-layer result caps.
        kb_scope: Optional repo-relative directories; when set, only claims
            under those directories are scored.
    """
    if index is None:
        index = KBIndex(repo_dir)
    index.ensure_fresh()

    ctx = KBContext()

    # Normalize query
    query_lower = query.lower()
    query_tokens = _tokenize(query_lower)

    # ── Layer 1: Entity Resolution ──
    # Score each entity by how many query tokens match its aliases/name
    scored_entities: list[tuple[float, int]] = []  # (score, index)

    # Build a set of candidate indices from alias map + substring matching
    candidate_indices = set()
    for token in query_tokens:
        if token in index._entity_alias_map:
            candidate_indices.update(index._entity_alias_map[token])
        if token.startswith("$"):
            # "$sol" should also hit entities whose alias is bare "sol"
            bare = token[1:]
            if bare in index._entity_alias_map:
                candidate_indices.update(index._entity_alias_map[bare])

    # Substring pass catches partial-name queries the exact alias map misses
    for i, ent in enumerate(index._entities):
        for token in query_tokens:
            if len(token) >= 3 and token in ent["name"].lower():
                candidate_indices.add(i)

    # Score candidates by query token overlap
    for idx in candidate_indices:
        ent = index._entities[idx]
        score = _score_entity(query_lower, query_tokens, ent)
        if score > 0:
            scored_entities.append((score, idx))

    scored_entities.sort(key=lambda x: x[0], reverse=True)

    for score, idx in scored_entities[:max_entities]:
        ent = index._entities[idx]
        ctx.entities.append(EntityMatch(
            name=ent["name"],
            path=ent["path"],
            entity_type=ent["type"],
            domain=ent["domain"],
            overview=_sanitize_for_prompt(ent["overview"], max_len=8000),
            tags=ent["tags"],
            related_claims=ent["related_claims"],
        ))

    # Collect entity-related claim titles for boosting
    entity_claim_titles = set()
    for em in ctx.entities:
        for rc in em.related_claims:
            entity_claim_titles.add(rc.lower().replace("-", " "))

    # ── Layer 2: Claim Search ──
    # Import min score threshold (filters single-stopword garbage matches)
    try:
        from lib.config import RETRIEVAL_MIN_CLAIM_SCORE as MIN_SCORE
    except ImportError:
        MIN_SCORE = 3.0

    scored_claims: list[tuple[float, dict]] = []

    # Resolve kb_scope to directory roots. Path.is_relative_to (not string
    # prefix matching) so a scope of "domains/ai" cannot falsely match
    # "domains/ai-safety".
    scope_roots: list[Path] | None = None
    if kb_scope:
        scope_roots = [Path(repo_dir) / s for s in kb_scope]

    for claim in index._claims:
        # Domain filtering: if kb_scope is set, only score claims in-scope
        if scope_roots:
            claim_path = Path(claim["path"])
            if not any(claim_path.is_relative_to(root) for root in scope_roots):
                continue
        score = _score_claim(query_lower, query_tokens, claim, entity_claim_titles)
        if score >= MIN_SCORE:
            scored_claims.append((score, claim))

    scored_claims.sort(key=lambda x: x[0], reverse=True)

    for score, claim in scored_claims[:max_claims]:
        ctx.claims.append(ClaimMatch(
            title=claim["title"],
            path=claim["path"],
            domain=claim["domain"],
            confidence=claim["confidence"],
            description=_sanitize_for_prompt(claim.get("description", "")),
            score=score,
        ))

    # ── Layer 3: Agent Context ──
    # Find positions referencing matched entities or claims
    match_terms = set(query_tokens)
    for em in ctx.entities:
        match_terms.add(em.name.lower())
    for cm in ctx.claims:
        # Add key words from matched claim titles
        match_terms.update(t for t in cm.title.lower().split() if len(t) >= 4)

    for pos in index._positions:
        pos_text = (pos["title"] + " " + pos["content"]).lower()
        overlap = sum(1 for t in match_terms if t in pos_text)
        if overlap >= 2:  # require two term hits to avoid incidental matches
            ctx.positions.append(PositionMatch(
                agent=pos["agent"],
                title=pos["title"],
                content=_sanitize_for_prompt(pos["content"]),
            ))
            if len(ctx.positions) >= max_positions:
                break

    # Extract relevant belief excerpts (belief contents are pre-truncated at index time)
    for belief in index._beliefs:
        belief_text = belief["content"].lower()
        overlap = sum(1 for t in match_terms if t in belief_text)
        if overlap >= 2:
            # Extract relevant paragraphs
            excerpts = _extract_relevant_paragraphs(belief["content"], match_terms, max_paragraphs=2)
            for exc in excerpts:
                ctx.belief_excerpts.append(f"**{belief['agent']}**: {_sanitize_for_prompt(exc)}")

    # Stats
    ctx.stats = {
        "total_claims": len(index._claims),
        "total_entities": len(index._entities),
        "total_positions": len(index._positions),
        "entities_matched": len(ctx.entities),
        "claims_matched": len(ctx.claims),
    }

    return ctx
|
||
|
||
|
||
# ─── Scoring ──────────────────────────────────────────────────────────
|
||
|
||
|
||
# Tokens ignored by both entity and claim scoring: common English filler plus
# conversational words typical of chat queries ("tell", "please", "think").
# All lowercase — compared against lowercased query tokens.
_STOP_WORDS = frozenset({
    "the", "for", "and", "but", "not", "you", "can", "has", "are", "was",
    "its", "all", "had", "her", "one", "our", "out", "new", "now", "old",
    "see", "way", "may", "say", "she", "two", "how", "did", "get", "put",
    "give", "me", "ok", "full", "text", "what", "about", "tell", "this",
    "that", "with", "from", "have", "more", "some", "than", "them", "then",
    "into", "also", "just", "your", "been", "here", "will", "does", "know",
    "please", "think",
})
|
||
|
||
|
||
def _score_entity(query_lower: str, query_tokens: list[str], entity: dict) -> float:
    """Score an entity against a query. Higher = more relevant."""
    name_text = entity["name"].lower()
    overview_text = entity.get("overview", "").lower()
    alias_list = entity.get("aliases", [])

    # Stop words and very short tokens carry no signal — drop them up front.
    tokens = [t for t in query_tokens if t not in _STOP_WORDS and len(t) >= 3]

    total = 0.0
    for tok in tokens:
        if tok in name_text:
            total += 3.0  # name hit: strongest single-token signal
        elif any(tok == a or tok in a for a in alias_list):
            total += 1.0  # alias hit (tags, proposer, parent_entity, tickers)
        elif tok in overview_text:
            total += 0.5  # body-content hit: weakest signal

    # Adjacent token pairs appearing verbatim in the name get a large boost
    # (e.g. "robin hanson" inside the entity name).
    for first, second in zip(tokens, tokens[1:]):
        if f"{first} {second}" in name_text:
            total += 5.0

    return total
|
||
|
||
|
||
def _score_claim(query_lower: str, query_tokens: list[str], claim: dict,
                 entity_claim_titles: set[str]) -> float:
    """Score a claim against a query. Higher = more relevant."""
    title = claim["title"].lower()
    desc = claim.get("description", "").lower()
    haystack = f"{title} {desc}"

    # Same stopword filter as entity scoring. Without it, filler words like
    # "from"/"what" rack up points and garbage matches slip through.
    tokens = [t for t in query_tokens if t not in _STOP_WORDS and len(t) >= 3]

    total = 0.0
    for tok in tokens:
        # Title hits score double relative to description-only hits.
        if tok in title:
            total += 2.0
        elif tok in haystack:
            total += 1.0

    # Strong boost when this claim is wiki-linked from a matched entity.
    if any(linked in title for linked in entity_claim_titles):
        total += 5.0

    # Adjacent token pairs appearing verbatim anywhere in title+description.
    for first, second in zip(tokens, tokens[1:]):
        if f"{first} {second}" in haystack:
            total += 3.0

    return total
|
||
|
||
|
||
# ─── Helpers ──────────────────────────────────────────────────────────
|
||
|
||
|
||
def _parse_frontmatter(path: Path) -> tuple[dict | None, str]:
|
||
"""Parse YAML frontmatter and body from a markdown file."""
|
||
try:
|
||
text = path.read_text(errors="replace")
|
||
except Exception:
|
||
return None, ""
|
||
|
||
if not text.startswith("---"):
|
||
return None, text
|
||
|
||
end = text.find("\n---", 3)
|
||
if end == -1:
|
||
return None, text
|
||
|
||
try:
|
||
fm = yaml.safe_load(text[3:end])
|
||
if not isinstance(fm, dict):
|
||
return None, text
|
||
body = text[end + 4:].strip()
|
||
return fm, body
|
||
except yaml.YAMLError:
|
||
return None, text
|
||
|
||
|
||
def _domain_from_path(path: Path, repo_dir: Path) -> str:
|
||
"""Infer domain from file path."""
|
||
rel = path.relative_to(repo_dir)
|
||
parts = rel.parts
|
||
if len(parts) >= 2 and parts[0] in ("domains", "entities", "decisions"):
|
||
return parts[1]
|
||
if len(parts) >= 1 and parts[0] == "core":
|
||
return "core"
|
||
if len(parts) >= 1 and parts[0] == "foundations":
|
||
return parts[1] if len(parts) >= 2 else "foundations"
|
||
return "unknown"
|
||
|
||
|
||
def _tokenize(text: str) -> list[str]:
|
||
"""Split query into searchable tokens."""
|
||
# Keep $ prefix for ticker matching
|
||
tokens = re.findall(r"\$?\w+", text.lower())
|
||
# Filter out very short stop words but keep short tickers
|
||
return [t for t in tokens if len(t) >= 2]
|
||
|
||
|
||
def _sanitize_for_prompt(text: str, max_len: int = 1000) -> str:
|
||
"""Sanitize content before injecting into LLM prompt (Ganymede: security)."""
|
||
# Strip code blocks
|
||
text = re.sub(r"```.*?```", "[code block removed]", text, flags=re.DOTALL)
|
||
# Strip anything that looks like system instructions
|
||
text = re.sub(r"(system:|assistant:|human:|<\|.*?\|>)", "", text, flags=re.IGNORECASE)
|
||
# Truncate
|
||
return text[:max_len]
|
||
|
||
|
||
def _extract_relevant_paragraphs(text: str, terms: set[str], max_paragraphs: int = 2) -> list[str]:
|
||
"""Extract paragraphs from text that contain the most matching terms."""
|
||
paragraphs = text.split("\n\n")
|
||
scored = []
|
||
for p in paragraphs:
|
||
p_stripped = p.strip()
|
||
if len(p_stripped) < 20:
|
||
continue
|
||
p_lower = p_stripped.lower()
|
||
overlap = sum(1 for t in terms if t in p_lower)
|
||
if overlap > 0:
|
||
scored.append((overlap, p_stripped[:300]))
|
||
scored.sort(key=lambda x: x[0], reverse=True)
|
||
return [text for _, text in scored[:max_paragraphs]]
|
||
|
||
|
||
def format_context_for_prompt(ctx: KBContext) -> str:
    """Format KBContext as text for injection into the LLM prompt.

    Emits one markdown section per populated layer (entities, claims,
    positions, beliefs) plus a stats footer. Returns a fixed fallback
    string when nothing matched.
    """
    sections = []

    if ctx.entities:
        sections.append("## Matched Entities")
        for i, ent in enumerate(ctx.entities):
            sections.append(f"**{ent.name}** ({ent.entity_type}, {ent.domain})")
            # Top 3 entities get full content, rest get truncated
            if i < 3:
                sections.append(ent.overview[:8000])
            else:
                sections.append(ent.overview[:500])
            if ent.related_claims:
                sections.append("Related claims: " + ", ".join(ent.related_claims[:5]))
            sections.append("")  # blank line between entity blocks

    if ctx.claims:
        sections.append("## Relevant KB Claims")
        for claim in ctx.claims:
            sections.append(f"- **{claim.title}** (confidence: {claim.confidence}, domain: {claim.domain})")
            if claim.description:
                # Indented under the bullet line.
                sections.append(f"  {claim.description}")
        sections.append("")

    if ctx.positions:
        sections.append("## Agent Positions")
        for pos in ctx.positions:
            sections.append(f"**{pos.agent}**: {pos.title}")
            # Content is further truncated here even though PositionMatch
            # already caps at ~500 chars at index time.
            sections.append(pos.content[:200])
            sections.append("")

    if ctx.belief_excerpts:
        sections.append("## Relevant Beliefs")
        for exc in ctx.belief_excerpts:
            # Excerpts already carry their "**agent**: " prefix.
            sections.append(exc)
        sections.append("")

    if not sections:
        return "No relevant KB content found for this query."

    # Add stats footer
    sections.append(f"---\nKB: {ctx.stats.get('total_claims', '?')} claims, "
                    f"{ctx.stats.get('total_entities', '?')} entities. "
                    f"Matched: {ctx.stats.get('entities_matched', 0)} entities, "
                    f"{ctx.stats.get('claims_matched', 0)} claims.")

    return "\n".join(sections)
|
||
|
||
|
||
# --- Qdrant vector search integration ---

# Module-level import guard for lib.search (Fix 3: no per-call sys.path manipulation).
# _vector_search stays None when lib.search cannot be imported; callers
# (retrieve_vector_context) must treat None as "vector layer disabled".
_vector_search = None
try:
    import sys as _sys
    import os as _os
    # Pipeline root = parent of this file's directory; inserted into sys.path
    # once so `lib.search` resolves in the deployed layout.
    _pipeline_root = _os.path.dirname(_os.path.dirname(_os.path.abspath(__file__)))
    if _pipeline_root not in _sys.path:
        _sys.path.insert(0, _pipeline_root)
    from lib.search import search as _vector_search
except ImportError:
    logger.warning("Qdrant search unavailable at module load (lib.search not found)")
|
||
|
||
|
||
def retrieve_vector_context(query: str,
                            keyword_paths: list[str] | None = None) -> tuple[str, dict]:
    """Semantic search via Qdrant — returns (formatted_text, metadata).

    Complements retrieve_context() (symbolic/keyword) with semantic similarity.
    Falls back gracefully if Qdrant is unavailable.

    Args:
        keyword_paths: Claim paths already matched by keyword search. These are
        excluded at the Qdrant query level AND from graph expansion to avoid
        duplicates in the prompt.

    Returns:
        (formatted_text, metadata_dict)
        metadata_dict: {direct_results: [...], expanded_results: [...],
                        layers_hit: [...], duration_ms: int}
    """
    import time as _time
    t0 = _time.monotonic()
    # Shape returned on every failure path so callers can index it uniformly.
    empty_meta = {"direct_results": [], "expanded_results": [],
                  "layers_hit": [], "duration_ms": 0}

    if _vector_search is None:
        # lib.search failed to import at module load — vector layer disabled.
        return "", empty_meta

    try:
        results = _vector_search(query, expand=True,
                                 exclude=keyword_paths)
    except Exception as e:
        # Best-effort: a Qdrant outage must not break keyword retrieval.
        logger.warning("Qdrant search failed: %s", e)
        return "", empty_meta

    duration = int((_time.monotonic() - t0) * 1000)

    # NOTE(review): assumes lib.search returns a dict with keys "error",
    # "direct_results" and optional "expanded_results", whose entries carry
    # claim_path/claim_title/score — confirm against lib/search.py.
    if results.get("error") or not results.get("direct_results"):
        return "", {**empty_meta, "duration_ms": duration,
                    "error": results.get("error")}

    layers_hit = ["qdrant"]
    if results.get("expanded_results"):
        layers_hit.append("graph")

    # Build structured metadata for audit
    meta = {
        "direct_results": [
            {"path": r["claim_path"], "title": r["claim_title"],
             "score": r["score"], "domain": r.get("domain", ""),
             "source": "qdrant"}
            for r in results["direct_results"]
        ],
        "expanded_results": [
            {"path": r["claim_path"], "title": r["claim_title"],
             "edge_type": r.get("edge_type", "related"),
             "from_claim": r.get("from_claim", ""), "source": "graph"}
            for r in results.get("expanded_results", [])
        ],
        "layers_hit": layers_hit,
        "duration_ms": duration,
    }

    # Build formatted text for prompt (Fix 4: subsection headers)
    sections = []
    sections.append("## Semantic Search Results (Qdrant)")
    sections.append("")
    sections.append("### Direct matches")

    for r in results["direct_results"]:
        # Score is presumably cosine similarity in [0, 1] — rendered as a percent.
        score_pct = int(r["score"] * 100)
        line = f"- **{r['claim_title']}** ({score_pct}% match"
        if r.get("domain"):
            line += f", {r['domain']}"
        if r.get("confidence"):
            line += f", {r['confidence']}"
        line += ")"
        sections.append(line)
        if r.get("snippet"):
            sections.append(f"  {r['snippet']}")

    if results.get("expanded_results"):
        sections.append("")
        sections.append("### Related claims (graph expansion)")
        for r in results["expanded_results"]:
            edge = r.get("edge_type", "related")
            # Only show the weight multiplier when it differs from the default.
            weight_str = f" ×{r.get('edge_weight', 1.0)}" if r.get("edge_weight", 1.0) != 1.0 else ""
            sections.append(f"- {r['claim_title']} ({edge}{weight_str} → {r.get('from_claim', '').split('/')[-1]})")

    return "\n".join(sections), meta
|