teleo-codex/ops/pipeline-v2/telegram/kb_tools.py
m3taversal 7bfce6b706 commit telegram bot module from VPS — 20 files never previously in repo
Pulled from /opt/teleo-eval/telegram/ on VPS. Includes:
- bot.py (92K), kb_retrieval.py, kb_tools.py (agentic retrieval)
- retrieval.py (RRF merge, query decomposition, entity traversal)
- response.py (system prompt builder, response parser)
- agent_config.py, agent_runner.py (multi-agent template unit support)
- approval_stages.py, approvals.py, digest.py (approval workflow)
- eval_checks.py, eval.py (response quality checks)
- output_gate.py, x_publisher.py, x_client.py, x_search.py (X pipeline)
- market_data.py, worktree_lock.py (utilities)
- rio.yaml, theseus.yaml (agent configs)

These files were deployed to VPS but never committed to the repo.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 11:02:32 +02:00

719 lines
26 KiB
Python

#!/usr/bin/env python3
"""KB tools for LLM function-calling — source tracing + entity/claim lookup.
These tools let the agent trace claims back to their original sources,
find all claims from a specific piece of research, and read source documents.
Epimetheus owns this module.
"""
import logging
import os
import re
from pathlib import Path
import yaml
logger = logging.getLogger("tg.kb_tools")
# ─── Tool definitions (OpenAI function-calling format) ───────────────
# Tool schemas advertised to the LLM. Each entry follows the OpenAI
# function-calling format: {"type": "function", "function": {...}}, and each
# "name" maps 1:1 onto a handler dispatched by execute_tool() below.
TOOL_DEFINITIONS: list[dict] = [
    # find_by_source — trace extracted claims back to a source document.
    {
        "type": "function",
        "function": {
            "name": "find_by_source",
            "description": (
                "Find all claims extracted from a specific source (article, paper, thread). "
                "Search by author name, source title, or keywords. Returns all claims from "
                "matching sources with their frontmatter."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Author name, source title, or keywords to match against claim source fields",
                    },
                },
                "required": ["query"],
            },
        },
    },
    # read_source — fetch the original document behind a claim.
    {
        "type": "function",
        "function": {
            "name": "read_source",
            "description": (
                "Read the original source document (article, thread, paper) that claims were "
                "extracted from. Use when you need the full context behind a claim, not just "
                "the extracted summary."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "source_title": {
                        "type": "string",
                        "description": "Title or slug of the source document to read",
                    },
                },
                "required": ["source_title"],
            },
        },
    },
    # read_entity — full profile of one KB entity.
    {
        "type": "function",
        "function": {
            "name": "read_entity",
            "description": "Read the full profile of a KB entity (project, person, protocol).",
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "Entity name or slug",
                    },
                },
                "required": ["name"],
            },
        },
    },
    # list_entity_links — enumerate the [[wiki-links]] in an entity file.
    {
        "type": "function",
        "function": {
            "name": "list_entity_links",
            "description": "List all entities and claims linked from an entity's wiki-links.",
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "Entity name or slug",
                    },
                },
                "required": ["name"],
            },
        },
    },
    # read_claim — full content of a single claim file.
    {
        "type": "function",
        "function": {
            "name": "read_claim",
            "description": "Read the full content of a specific claim file.",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {
                        "type": "string",
                        "description": "Claim title or slug",
                    },
                },
                "required": ["title"],
            },
        },
    },
    # search_kb — keyword search over claims (delegates to kb_retrieval).
    {
        "type": "function",
        "function": {
            "name": "search_kb",
            "description": "Search the KB for claims matching a query. Uses keyword matching.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query",
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Max results to return (default 5)",
                    },
                },
                "required": ["query"],
            },
        },
    },
    # explore_graph — 1-hop traversal of a claim's knowledge-graph edges.
    {
        "type": "function",
        "function": {
            "name": "explore_graph",
            "description": (
                "Follow knowledge graph edges from a claim to find connected claims. "
                "Returns all claims linked via supports, challenges, depends_on, and related edges. "
                "Use this to discover the full argument structure around a claim — what supports it, "
                "what challenges it, and what it depends on."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "claim_title": {
                        "type": "string",
                        "description": "Title or slug of the claim to explore edges from",
                    },
                },
                "required": ["claim_title"],
            },
        },
    },
    # search_sources — search the raw source archive (not extracted claims).
    {
        "type": "function",
        "function": {
            "name": "search_sources",
            "description": (
                "Search the source archive for original documents by topic, author, or title. "
                "Returns matching source files with their titles and first few lines. "
                "Use this when you want to find the original research/article/thread, not just extracted claims."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Topic, author name, or keywords to search source documents",
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Max results to return (default 5)",
                    },
                },
                "required": ["query"],
            },
        },
    },
    # pr_status — pipeline DB lookup for a PR's eval/merge state.
    {
        "type": "function",
        "function": {
            "name": "pr_status",
            "description": (
                "Check the status of a pipeline PR by number. Returns eval verdicts, "
                "merge status, time in queue, rejection reasons, and retry counts."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "pr_number": {
                        "type": "integer",
                        "description": "PR number to look up",
                    },
                },
                "required": ["pr_number"],
            },
        },
    },
    # check_duplicate — vector-similarity dedup check against the KB.
    {
        "type": "function",
        "function": {
            "name": "check_duplicate",
            "description": (
                "Check if a claim is a near-duplicate of existing KB content. "
                "Returns top-3 closest matches with similarity scores. "
                ">=0.85 = likely duplicate, 0.70-0.85 = check manually, <0.70 = novel."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The claim text to check for duplicates",
                    },
                },
                "required": ["text"],
            },
        },
    },
]
# ─── Tool implementations ────────────────────────────────────────────
def find_by_source(query: str, kb_dir: str) -> str:
    """Find all claims extracted from sources matching the query.

    Searches claim frontmatter `source:` fields for author names, titles,
    or keywords, then returns the matching claims grouped by source.

    Args:
        query: Author name, source title, or keywords.
        kb_dir: Root directory of the knowledge base.

    Returns:
        Human-readable summary, capped at 4000 chars, 5 sources, and
        10 claims per source for prompt safety.
    """
    query_lower = query.lower()
    # Tokens shorter than 3 chars are noise ("a", "of", ...) — drop them.
    query_tokens = [t for t in re.findall(r'\w+', query_lower) if len(t) >= 3]
    kb_root = Path(kb_dir)
    claim_dirs = [kb_root / "domains", kb_root / "core", kb_root / "foundations"]
    # Scan all claim files for matching source fields.
    matches: list[dict] = []
    for claim_dir in claim_dirs:
        if not claim_dir.exists():
            continue
        for md_file in claim_dir.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue
            try:
                fm, _body = _parse_frontmatter(md_file)
                if not fm:
                    continue
                # BUG FIX: coerce to str — YAML can yield None/dict/list for
                # these fields, and a non-hashable value previously crashed
                # the grouping step below (dict used as a dict key).
                source = str(fm.get("source") or "")
                source_file = str(fm.get("source_file") or "")
                searchable = f"{source} {source_file}".lower()
                # Score: how many query tokens appear in the source fields.
                score = sum(1 for t in query_tokens if t in searchable)
                # Require at least half the tokens (minimum one) to match.
                if score >= max(1, len(query_tokens) // 2):
                    matches.append({
                        "title": md_file.stem.replace("-", " "),
                        "path": str(md_file.relative_to(kb_dir)),
                        "source": source,
                        "source_file": source_file,
                        "domain": fm.get("domain", "unknown"),
                        "confidence": fm.get("confidence", "unknown"),
                        "description": fm.get("description", ""),
                        "score": score,
                    })
            except Exception:
                # Best-effort scan: skip unreadable/malformed files.
                continue
    if not matches:
        return f"No claims found from sources matching '{query}'."
    # Sort by score desc, then group by source so the output reads as
    # "source -> its claims".
    matches.sort(key=lambda m: m["score"], reverse=True)
    by_source: dict[str, list[dict]] = {}
    for m in matches:
        by_source.setdefault(m["source"] or "unknown", []).append(m)
    lines = [f"Found {len(matches)} claims from {len(by_source)} matching sources:\n"]
    for source_name, claims in list(by_source.items())[:5]:  # Cap at 5 sources
        lines.append(f"## Source: {source_name}")
        if claims[0].get("source_file"):
            lines.append(f"File: {claims[0]['source_file']}")
        for c in claims[:10]:  # Cap at 10 claims per source
            lines.append(f"- **{c['title']}** ({c['confidence']}, {c['domain']})")
            if c["description"]:
                lines.append(f" {c['description'][:200]}")
        lines.append("")
    return "\n".join(lines)[:4000]
def read_source(source_title: str, kb_dir: str) -> str:
    """Read the original source document from the archive.

    Scans inbox/archive/, sources/, and inbox/queue/ for the markdown file
    whose name best matches the requested title, then returns its content
    (truncated to 4K for prompt safety).
    """
    wanted = source_title.lower()
    wanted_slug = re.sub(r'[^a-z0-9]+', '-', wanted).strip('-')
    candidate_dirs = (
        Path(kb_dir) / "inbox" / "archive",
        Path(kb_dir) / "sources",
        Path(kb_dir) / "inbox" / "queue",
    )
    top_file = None
    top_score = 0
    for directory in candidate_dirs:
        if not directory.exists():
            continue
        for candidate in directory.rglob("*.md"):
            stem = candidate.stem.lower()
            # Token-overlap score, with a big bonus for an exact slug hit.
            overlap = sum(
                1
                for tok in re.findall(r'\w+', wanted)
                if len(tok) >= 3 and tok in stem
            )
            if wanted_slug in stem:
                overlap += 5
            if overlap > top_score:
                top_score = overlap
                top_file = candidate
    if top_file is None:
        return f"Source document '{source_title}' not found in archive."
    try:
        text = top_file.read_text(errors="replace")
    except Exception as e:
        return f"Error reading source: {e}"
    # Truncate to 4K for prompt safety.
    if len(text) > 4000:
        text = text[:4000] + "\n\n[... truncated, full document is longer ...]"
    return f"## Source: {top_file.name}\n\n{text}"
def read_entity(name: str, kb_dir: str) -> str:
    """Read the full profile of a KB entity, truncated to 4000 chars."""
    root = Path(kb_dir)
    target = _find_file(name, [root / "entities", root / "decisions"])
    if target is None:
        return f"Entity '{name}' not found."
    try:
        return target.read_text(errors="replace")[:4000]
    except Exception as e:
        return f"Error reading entity: {e}"
def list_entity_links(name: str, kb_dir: str) -> str:
    """List all wiki-links from an entity file, deduplicated (case-insensitive)."""
    root = Path(kb_dir)
    entity_file = _find_file(name, [root / "entities", root / "decisions"])
    if entity_file is None:
        return f"Entity '{name}' not found."
    try:
        text = entity_file.read_text(errors="replace")
    except Exception as e:
        return f"Error reading entity links: {e}"
    # Dedup while preserving first-seen order; comparison is lowercased so
    # [[Foo]] and [[foo]] count as the same link.
    seen: set[str] = set()
    ordered: list[str] = []
    for target in re.findall(r"\[\[([^\]]+)\]\]", text):
        key = target.lower()
        if key not in seen:
            seen.add(key)
            ordered.append(target)
    if not ordered:
        return f"Entity '{name}' has no wiki-links."
    bullet_list = "\n".join(f"- [[{t}]]" for t in ordered)
    return f"Entity '{name}' links to {len(ordered)} items:\n" + bullet_list
def read_claim(title: str, kb_dir: str) -> str:
    """Read the full content of a claim file, truncated to 4000 chars."""
    root = Path(kb_dir)
    target = _find_file(title, [root / "domains", root / "core", root / "foundations"])
    if target is None:
        return f"Claim '{title}' not found."
    try:
        return target.read_text(errors="replace")[:4000]
    except Exception as e:
        return f"Error reading claim: {e}"
def search_kb(query: str, kb_dir: str, max_results: int = 5) -> str:
    """Search KB claims by keyword matching via the shared KB index."""
    from kb_retrieval import KBIndex, retrieve_context

    kb_index = KBIndex(kb_dir)
    kb_index.ensure_fresh()
    result = retrieve_context(query, kb_dir, index=kb_index, max_claims=max_results)
    if not result.claims:
        return f"No claims found for '{query}'."
    out = [f"Found {len(result.claims)} claims:"]
    for claim in result.claims:
        out.append(f"- **{claim.title}** ({claim.confidence}, {claim.domain}, score: {claim.score:.1f})")
        if claim.description:
            out.append(f" {claim.description[:200]}")
    return "\n".join(out)
def explore_graph(claim_title: str, kb_dir: str) -> str:
    """Follow knowledge graph edges from a claim to find connected claims.

    Prefers lib/search.py graph_expand() for the 1-hop traversal of
    supports/challenges/depends_on/related edges; if that module is not
    importable, falls back to reading the edge lists straight out of the
    claim's frontmatter.
    """
    root = Path(kb_dir)
    # Locate the claim file first — without it there is nothing to traverse.
    claim_file = _find_file(claim_title, [root / "domains", root / "core", root / "foundations"])
    if claim_file is None:
        return f"Claim '{claim_title}' not found. Try a different title or use search_kb to find it first."
    try:
        rel_path = str(claim_file.relative_to(kb_dir))
    except ValueError:
        rel_path = str(claim_file)
    try:
        # Preferred path: the shared graph traversal helper.
        from lib.search import graph_expand
        edges = graph_expand([rel_path], repo_root=Path(kb_dir), max_expanded=20)
    except ImportError:
        # Fallback: parse edge lists directly from the claim's frontmatter.
        edges = []
        fm, _ = _parse_frontmatter(claim_file)
        if fm:
            for kind in ("supports", "challenges", "challenged_by", "depends_on", "related"):
                raw = fm.get(kind, [])
                if isinstance(raw, str):
                    raw = [raw]
                if isinstance(raw, list):
                    edges.extend(
                        {"claim_title": t, "edge_type": kind, "edge_weight": 1.0}
                        for t in raw
                    )
    if not edges:
        return f"Claim '{claim_title}' has no graph edges (no supports, challenges, or related claims)."
    # Group edges by type so the output reads as one section per relation.
    grouped: dict[str, list[dict]] = {}
    for edge in edges:
        grouped.setdefault(edge["edge_type"], []).append(edge)
    labels = {
        "supports": "Supports (this claim backs these up)",
        "challenges": "Challenges (this claim argues against these)",
        "challenged_by": "Challenged by (these argue against this claim)",
        "depends_on": "Depends on (prerequisites for this claim)",
        "related": "Related (connected by topic)",
        "wiki_links": "Wiki-linked (mentioned in body text)",
    }
    out = [f"Graph edges from '{claim_title}' ({len(edges)} connected claims):\n"]
    for kind, members in grouped.items():
        out.append(f"### {labels.get(kind, kind)}")
        for member in members:
            name = member.get("claim_title", "unknown")
            weight = member.get("edge_weight", 1.0)
            suffix = f" (weight: {weight})" if weight != 1.0 else ""
            out.append(f"- {name}" + suffix)
        out.append("")
    return "\n".join(out)[:4000]
def search_sources(query: str, kb_dir: str, max_results: int = 5) -> str:
    """Search the source archive for original documents by topic/author/title.

    Scans inbox/archive/, sources/, and inbox/queue/, scoring each file by
    query-token overlap with its filename; files whose names do not match
    get a weaker (half-weight) score from their opening text.

    Args:
        query: Topic, author name, or keywords.
        kb_dir: Root directory of the knowledge base.
        max_results: Maximum number of source documents to return.

    Returns:
        Human-readable summary (max 4000 chars).
    """
    query_lower = query.lower()
    query_tokens = [t for t in re.findall(r'\w+', query_lower) if len(t) >= 3]
    if not query_tokens:
        return "Query too short — provide at least one keyword with 3+ characters."
    root = Path(kb_dir)
    search_dirs = [root / "inbox" / "archive", root / "sources", root / "inbox" / "queue"]
    # Hoisted loop invariant: minimum score required to count as a match.
    threshold = max(1, len(query_tokens) // 3)
    matches: list[dict] = []
    for search_dir in search_dirs:
        if not search_dir.exists():
            continue
        for md_file in search_dir.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue
            file_stem = md_file.stem.lower().replace("-", " ")
            # Score by token overlap with the filename.
            score = sum(1 for t in query_tokens if t in file_stem)
            # PERF FIX: read each file at most once (the original read every
            # matching file twice — once for head scoring, once for preview).
            text = None
            if score == 0:
                # Filename didn't match — fall back to scanning the opening
                # text for author/topic mentions, at half weight.
                try:
                    text = md_file.read_text(errors="replace")
                except Exception:
                    continue
                head = text[:500].lower()
                score = sum(0.5 for t in query_tokens if t in head)
            if score >= threshold:
                if text is None:
                    try:
                        text = md_file.read_text(errors="replace")
                    except Exception:
                        text = None
                preview = text[:300].strip() if text is not None else "(could not read)"
                matches.append({
                    "title": md_file.stem.replace("-", " "),
                    "path": str(md_file.relative_to(kb_dir)),
                    "score": score,
                    "preview": preview,
                })
    if not matches:
        return f"No source documents found matching '{query}'. Try different keywords or check find_by_source for claims from that source."
    matches.sort(key=lambda m: m["score"], reverse=True)
    matches = matches[:max_results]
    lines = [f"Found {len(matches)} source documents:\n"]
    for m in matches:
        lines.append(f"### {m['title']}")
        lines.append(f"Path: {m['path']}")
        lines.append(f"{m['preview'][:200]}")
        lines.append("")
    return "\n".join(lines)[:4000]
# ─── Tool dispatcher ─────────────────────────────────────────────────
def execute_tool(tool_name: str, args: dict, kb_dir: str) -> str:
    """Dispatch a tool call by name. Returns the tool's string result.

    Unknown tool names produce an error string rather than raising, so the
    agent loop can surface the problem to the model.
    """
    # Dispatch table: tool name -> zero-arg thunk that extracts its own
    # arguments from `args` with the same defaults as the schema.
    dispatch = {
        "find_by_source": lambda: find_by_source(args.get("query", ""), kb_dir),
        "read_source": lambda: read_source(args.get("source_title", ""), kb_dir),
        "read_entity": lambda: read_entity(args.get("name", ""), kb_dir),
        "list_entity_links": lambda: list_entity_links(args.get("name", ""), kb_dir),
        "read_claim": lambda: read_claim(args.get("title", ""), kb_dir),
        "search_kb": lambda: search_kb(args.get("query", ""), kb_dir, args.get("max_results", 5)),
        "explore_graph": lambda: explore_graph(args.get("claim_title", ""), kb_dir),
        "search_sources": lambda: search_sources(args.get("query", ""), kb_dir, args.get("max_results", 5)),
        "pr_status": lambda: _tool_pr_status(args.get("pr_number", 0)),
        "check_duplicate": lambda: _tool_check_duplicate(args.get("text", "")),
    }
    handler = dispatch.get(tool_name)
    if handler is None:
        return f"Unknown tool: {tool_name}"
    return handler()
# ─── Helpers ─────────────────────────────────────────────────────────
def _parse_frontmatter(path: Path) -> tuple[dict | None, str]:
"""Parse YAML frontmatter and body from a markdown file."""
try:
text = path.read_text(errors="replace")
except Exception:
return None, ""
if not text.startswith("---"):
return None, text
end = text.find("\n---", 3)
if end == -1:
return None, text
try:
fm = yaml.safe_load(text[3:end])
if not isinstance(fm, dict):
return None, text
body = text[end + 4:].strip()
return fm, body
except yaml.YAMLError:
return None, text
def _find_file(name: str, search_dirs: list[Path]) -> Path | None:
"""Find a markdown file by name/slug across search directories."""
slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
name_lower = name.lower()
for search_dir in search_dirs:
if not search_dir.exists():
continue
for md_file in search_dir.rglob("*.md"):
if md_file.name.startswith("_"):
continue
stem_lower = md_file.stem.lower()
# Exact slug match
if stem_lower == slug:
return md_file
# Normalized match (spaces vs hyphens)
if stem_lower.replace("-", " ") == name_lower.replace("-", " "):
return md_file
# Substring match for long titles
if len(slug) >= 8 and slug in stem_lower:
return md_file
return None
# ─── Pipeline DB tools ──────────────────────────────────────────────
def _tool_pr_status(pr_number: int) -> str:
    """Look up a pipeline PR in the SQLite pipeline DB and format its status.

    Reads the DB path from $PIPELINE_DB (defaults to the VPS deployment
    path). Returns a multi-line human-readable summary, a "not found"
    message, or an error string — this tool never raises.

    Args:
        pr_number: PR number to look up.
    """
    import json
    import sqlite3

    db_path = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
    try:
        conn = sqlite3.connect(db_path)
        try:
            conn.row_factory = sqlite3.Row
            row = conn.execute(
                """SELECT number, branch, source_path, status, domain, agent,
                          commit_type, tier, leo_verdict, domain_verdict,
                          domain_agent, eval_issues, priority, origin,
                          cost_usd, created_at, merged_at, last_attempt, last_error,
                          transient_retries, substantive_retries, description
                   FROM prs WHERE number = ?""",
                (pr_number,),
            ).fetchone()
        finally:
            # FIX: close even when the query raises (the connection was
            # previously leaked on the error path).
            conn.close()
        if not row:
            return f"PR #{pr_number} not found."
        issues = []
        try:
            issues = json.loads(row["eval_issues"] or "[]")
        except (json.JSONDecodeError, TypeError):
            pass
        lines = [
            # FIX: the header previously concatenated number and status with
            # no separator ("PR #42MERGED").
            f"PR #{row['number']} — {row['status'].upper()}",
            f"Branch: {row['branch']}",
            f"Domain: {row['domain'] or 'unknown'} | Agent: {row['agent'] or 'pipeline'}",
            f"Type: {row['commit_type'] or 'unknown'} | Tier: {row['tier'] or 'unknown'}",
            f"Leo verdict: {row['leo_verdict']} | Domain verdict: {row['domain_verdict']}",
        ]
        if row["description"]:
            lines.append(f"Description: {row['description']}")
        if issues:
            lines.append(f"Eval issues: {', '.join(str(i) for i in issues)}")
        if row["last_error"]:
            lines.append(f"Last error: {row['last_error'][:200]}")
        lines.append(f"Retries: {row['transient_retries']} transient, {row['substantive_retries']} substantive")
        lines.append(f"Created: {row['created_at']} | Last attempt: {row['last_attempt']}")
        if row["merged_at"]:
            lines.append(f"Merged: {row['merged_at']}")
        if row["cost_usd"]:
            lines.append(f"Eval cost: ${row['cost_usd']:.4f}")
        return "\n".join(lines)
    except Exception as e:
        return f"Error querying PR #{pr_number}: {e}"
def _tool_check_duplicate(text: str) -> str:
    """Check a claim text for near-duplicates via lib.search.check_duplicate.

    Formats the verdict (duplicate/check/novel threshold scoring happens in
    lib.search) plus the top matches with their scores and paths.

    Args:
        text: The claim text to check. Required.
    """
    # FIX: validate input BEFORE mutating sys.path / importing, so an empty
    # call has no side effects.
    if not text:
        return "Error: text is required."
    import sys
    parent_dir = os.path.join(os.path.dirname(__file__), "..")
    # FIX: guard the insert — the original prepended unconditionally on
    # every call, growing sys.path without bound.
    if parent_dir not in sys.path:
        sys.path.insert(0, parent_dir)
    from lib.search import check_duplicate as _check_dup
    result = _check_dup(text)
    if result.get("error"):
        return f"Error: {result['error']}"
    lines = [f"Verdict: {result['verdict'].upper()} (highest score: {result['highest_score']:.4f})"]
    for i, m in enumerate(result["matches"], 1):
        lines.append(
            f" {i}. [{m['score']:.4f}] {m['claim_title'][:80]}"
            f"\n Path: {m['claim_path']}"
        )
    if not result["matches"]:
        lines.append(" No matches found above minimum threshold.")
    return "\n".join(lines)