#!/usr/bin/env python3
"""KB tools for LLM function-calling — source tracing + entity/claim lookup.

These tools let the agent trace claims back to their original sources, find
all claims from a specific piece of research, and read source documents.
Epimetheus owns this module.
"""
import logging
import os
import re
from pathlib import Path

import yaml

logger = logging.getLogger("tg.kb_tools")

# ─── Tool definitions (OpenAI function-calling format) ───────────────

TOOL_DEFINITIONS = [
    {
        "type": "function",
        "function": {
            "name": "find_by_source",
            "description": (
                "Find all claims extracted from a specific source (article, paper, thread). "
                "Search by author name, source title, or keywords. Returns all claims from "
                "matching sources with their frontmatter."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Author name, source title, or keywords to match against claim source fields",
                    },
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "read_source",
            "description": (
                "Read the original source document (article, thread, paper) that claims were "
                "extracted from. Use when you need the full context behind a claim, not just "
                "the extracted summary."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "source_title": {
                        "type": "string",
                        "description": "Title or slug of the source document to read",
                    },
                },
                "required": ["source_title"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "read_entity",
            "description": "Read the full profile of a KB entity (project, person, protocol).",
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "Entity name or slug",
                    },
                },
                "required": ["name"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "list_entity_links",
            "description": "List all entities and claims linked from an entity's wiki-links.",
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "Entity name or slug",
                    },
                },
                "required": ["name"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "read_claim",
            "description": "Read the full content of a specific claim file.",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {
                        "type": "string",
                        "description": "Claim title or slug",
                    },
                },
                "required": ["title"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_kb",
            "description": "Search the KB for claims matching a query. Uses keyword matching.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query",
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Max results to return (default 5)",
                    },
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "explore_graph",
            "description": (
                "Follow knowledge graph edges from a claim to find connected claims. "
                "Returns all claims linked via supports, challenges, depends_on, and related edges. "
                "Use this to discover the full argument structure around a claim — what supports it, "
                "what challenges it, and what it depends on."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "claim_title": {
                        "type": "string",
                        "description": "Title or slug of the claim to explore edges from",
                    },
                },
                "required": ["claim_title"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_sources",
            "description": (
                "Search the source archive for original documents by topic, author, or title. "
                "Returns matching source files with their titles and first few lines. "
                "Use this when you want to find the original research/article/thread, not just extracted claims."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Topic, author name, or keywords to search source documents",
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Max results to return (default 5)",
                    },
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "pr_status",
            "description": (
                "Check the status of a pipeline PR by number. Returns eval verdicts, "
                "merge status, time in queue, rejection reasons, and retry counts."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "pr_number": {
                        "type": "integer",
                        "description": "PR number to look up",
                    },
                },
                "required": ["pr_number"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "check_duplicate",
            "description": (
                "Check if a claim is a near-duplicate of existing KB content. "
                "Returns top-3 closest matches with similarity scores. "
                ">=0.85 = likely duplicate, 0.70-0.85 = check manually, <0.70 = novel."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The claim text to check for duplicates",
                    },
                },
                "required": ["text"],
            },
        },
    },
]

# ─── Tool implementations ────────────────────────────────────────────


def find_by_source(query: str, kb_dir: str) -> str:
    """Find all claims extracted from sources matching the query.

    Searches claim frontmatter `source:` fields for author names, titles,
    keywords. Returns structured list of all claims from matching sources.
    """
    query_lower = query.lower()
    # Tokens shorter than 3 chars are too noisy to score on.
    query_tokens = [t for t in re.findall(r'\w+', query_lower) if len(t) >= 3]

    # Scan all claim files for matching source fields
    matches: list[dict] = []
    claim_dirs = [
        Path(kb_dir) / "domains",
        Path(kb_dir) / "core",
        Path(kb_dir) / "foundations",
    ]
    for claim_dir in claim_dirs:
        if not claim_dir.exists():
            continue
        for md_file in claim_dir.rglob("*.md"):
            # Underscore-prefixed files are templates/internal — skip.
            if md_file.name.startswith("_"):
                continue
            try:
                fm, body = _parse_frontmatter(md_file)
                if not fm:
                    continue
                source = fm.get("source", "")
                source_file = fm.get("source_file", "")
                searchable = f"{source} {source_file}".lower()
                # Score: how many query tokens appear in the source field
                score = sum(1 for t in query_tokens if t in searchable)
                # Require at least half the tokens (and at least one) to match.
                if score >= max(1, len(query_tokens) // 2):
                    matches.append({
                        "title": md_file.stem.replace("-", " "),
                        "path": str(md_file.relative_to(kb_dir)),
                        "source": source,
                        "source_file": source_file,
                        "domain": fm.get("domain", "unknown"),
                        "confidence": fm.get("confidence", "unknown"),
                        "description": fm.get("description", ""),
                        "score": score,
                    })
            except Exception:
                # Best-effort scan: a single unreadable file must not abort it.
                continue

    if not matches:
        return f"No claims found from sources matching '{query}'."

    # Sort by score desc, group by source
    matches.sort(key=lambda m: m["score"], reverse=True)

    # Group by source
    by_source: dict[str, list[dict]] = {}
    for m in matches:
        key = m["source"] or "unknown"
        by_source.setdefault(key, []).append(m)

    lines = [f"Found {len(matches)} claims from {len(by_source)} matching sources:\n"]
    for source_name, claims in list(by_source.items())[:5]:  # Cap at 5 sources
        lines.append(f"## Source: {source_name}")
        if claims[0].get("source_file"):
            lines.append(f"File: {claims[0]['source_file']}")
        for c in claims[:10]:  # Cap at 10 claims per source
            lines.append(f"- **{c['title']}** ({c['confidence']}, {c['domain']})")
            if c["description"]:
                lines.append(f"  {c['description'][:200]}")
        lines.append("")

    # 4K cap keeps the tool result prompt-safe.
    return "\n".join(lines)[:4000]


def read_source(source_title: str, kb_dir: str) -> str:
    """Read the original source document from the archive.

    Looks in inbox/archive/ and sources/ for matching files.
    """
    title_lower = source_title.lower()
    slug = re.sub(r'[^a-z0-9]+', '-', title_lower).strip('-')

    # Search paths for source files
    search_dirs = [
        Path(kb_dir) / "inbox" / "archive",
        Path(kb_dir) / "sources",
        Path(kb_dir) / "inbox" / "queue",
    ]

    best_match = None
    best_score = 0
    for search_dir in search_dirs:
        if not search_dir.exists():
            continue
        for md_file in search_dir.rglob("*.md"):
            file_slug = md_file.stem.lower()
            # Score by token overlap
            score = 0
            for token in re.findall(r'\w+', title_lower):
                if len(token) >= 3 and token in file_slug:
                    score += 1
            if slug in file_slug:
                score += 5  # Exact slug match
            if score > best_score:
                best_score = score
                best_match = md_file

    if not best_match:
        return f"Source document '{source_title}' not found in archive."

    try:
        content = best_match.read_text(errors="replace")
        # Truncate to 4K for prompt safety
        if len(content) > 4000:
            content = content[:4000] + "\n\n[... truncated, full document is longer ...]"
        return f"## Source: {best_match.name}\n\n{content}"
    except Exception as e:
        return f"Error reading source: {e}"


def read_entity(name: str, kb_dir: str) -> str:
    """Read the full profile of a KB entity."""
    entity_file = _find_file(name, [
        Path(kb_dir) / "entities",
        Path(kb_dir) / "decisions",
    ])
    if not entity_file:
        return f"Entity '{name}' not found."
    try:
        content = entity_file.read_text(errors="replace")
        return content[:4000]
    except Exception as e:
        return f"Error reading entity: {e}"


def list_entity_links(name: str, kb_dir: str) -> str:
    """List all wiki-links from an entity file, with dedup."""
    entity_file = _find_file(name, [
        Path(kb_dir) / "entities",
        Path(kb_dir) / "decisions",
    ])
    if not entity_file:
        return f"Entity '{name}' not found."
    try:
        content = entity_file.read_text(errors="replace")
        links = re.findall(r"\[\[([^\]]+)\]\]", content)
        # Dedup while preserving order (case-insensitive)
        seen = set()
        unique_links = []
        for link in links:
            if link.lower() not in seen:
                seen.add(link.lower())
                unique_links.append(link)
        if not unique_links:
            return f"Entity '{name}' has no wiki-links."
        return f"Entity '{name}' links to {len(unique_links)} items:\n" + "\n".join(
            f"- [[{link}]]" for link in unique_links
        )
    except Exception as e:
        return f"Error reading entity links: {e}"


def read_claim(title: str, kb_dir: str) -> str:
    """Read the full content of a claim file."""
    claim_file = _find_file(title, [
        Path(kb_dir) / "domains",
        Path(kb_dir) / "core",
        Path(kb_dir) / "foundations",
    ])
    if not claim_file:
        return f"Claim '{title}' not found."
    try:
        content = claim_file.read_text(errors="replace")
        return content[:4000]
    except Exception as e:
        return f"Error reading claim: {e}"


def search_kb(query: str, kb_dir: str, max_results: int = 5) -> str:
    """Search KB claims by keyword matching."""
    # Local import: kb_retrieval is only needed by this tool.
    from kb_retrieval import KBIndex, retrieve_context

    index = KBIndex(kb_dir)
    index.ensure_fresh()
    ctx = retrieve_context(query, kb_dir, index=index, max_claims=max_results)
    if not ctx.claims:
        return f"No claims found for '{query}'."
    lines = [f"Found {len(ctx.claims)} claims:"]
    for c in ctx.claims:
        lines.append(f"- **{c.title}** ({c.confidence}, {c.domain}, score: {c.score:.1f})")
        if c.description:
            lines.append(f"  {c.description[:200]}")
    return "\n".join(lines)


def explore_graph(claim_title: str, kb_dir: str) -> str:
    """Follow knowledge graph edges from a claim to find connected claims.

    Uses lib/search.py graph_expand() for 1-hop traversal of supports/challenges/
    depends_on/related edges in frontmatter.
    """
    # Find the claim file first
    claim_file = _find_file(claim_title, [
        Path(kb_dir) / "domains",
        Path(kb_dir) / "core",
        Path(kb_dir) / "foundations",
    ])
    if not claim_file:
        return f"Claim '{claim_title}' not found. Try a different title or use search_kb to find it first."

    try:
        rel_path = str(claim_file.relative_to(kb_dir))
    except ValueError:
        rel_path = str(claim_file)

    # Use the existing graph_expand from lib/search.py
    try:
        from lib.search import graph_expand
        expanded = graph_expand([rel_path], repo_root=Path(kb_dir), max_expanded=20)
    except ImportError:
        # Fallback: parse edges directly from the file
        expanded = []
        fm, body = _parse_frontmatter(claim_file)
        if fm:
            for edge_type in ("supports", "challenges", "challenged_by", "depends_on", "related"):
                targets = fm.get(edge_type, [])
                if isinstance(targets, str):
                    targets = [targets]
                if isinstance(targets, list):
                    for t in targets:
                        expanded.append({"claim_title": t, "edge_type": edge_type, "edge_weight": 1.0})

    if not expanded:
        return f"Claim '{claim_title}' has no graph edges (no supports, challenges, or related claims)."

    # Group by edge type for readability
    by_type: dict[str, list[dict]] = {}
    for e in expanded:
        by_type.setdefault(e["edge_type"], []).append(e)

    lines = [f"Graph edges from '{claim_title}' ({len(expanded)} connected claims):\n"]
    type_labels = {
        "supports": "Supports (this claim backs these up)",
        "challenges": "Challenges (this claim argues against these)",
        "challenged_by": "Challenged by (these argue against this claim)",
        "depends_on": "Depends on (prerequisites for this claim)",
        "related": "Related (connected by topic)",
        "wiki_links": "Wiki-linked (mentioned in body text)",
    }
    for edge_type, items in by_type.items():
        label = type_labels.get(edge_type, edge_type)
        lines.append(f"### {label}")
        for item in items:
            title = item.get("claim_title", "unknown")
            weight = item.get("edge_weight", 1.0)
            # Only surface non-default weights to keep output compact.
            lines.append(f"- {title}" + (f" (weight: {weight})" if weight != 1.0 else ""))
        lines.append("")

    return "\n".join(lines)[:4000]


def search_sources(query: str, kb_dir: str, max_results: int = 5) -> str:
    """Search the source archive for original documents by topic/author/title.

    Scans inbox/archive/ and sources/ directories, scoring by token overlap.
    """
    query_lower = query.lower()
    query_tokens = [t for t in re.findall(r'\w+', query_lower) if len(t) >= 3]
    if not query_tokens:
        return "Query too short — provide at least one keyword with 3+ characters."

    search_dirs = [
        Path(kb_dir) / "inbox" / "archive",
        Path(kb_dir) / "sources",
        Path(kb_dir) / "inbox" / "queue",
    ]

    matches: list[dict] = []
    for search_dir in search_dirs:
        if not search_dir.exists():
            continue
        for md_file in search_dir.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue
            file_stem = md_file.stem.lower().replace("-", " ")
            # Score by token overlap with filename
            score = sum(1 for t in query_tokens if t in file_stem)
            # Also check first 500 chars of file content for author/topic;
            # content hits are weighted at half a filename hit.
            if score == 0:
                try:
                    head = md_file.read_text(errors="replace")[:500].lower()
                    score = sum(0.5 for t in query_tokens if t in head)
                except Exception:
                    continue
            if score >= max(1, len(query_tokens) // 3):
                # Read first few lines for preview
                try:
                    preview = md_file.read_text(errors="replace")[:300].strip()
                except Exception:
                    preview = "(could not read)"
                matches.append({
                    "title": md_file.stem.replace("-", " "),
                    "path": str(md_file.relative_to(kb_dir)),
                    "score": score,
                    "preview": preview,
                })

    if not matches:
        return f"No source documents found matching '{query}'. Try different keywords or check find_by_source for claims from that source."

    matches.sort(key=lambda m: m["score"], reverse=True)
    matches = matches[:max_results]

    lines = [f"Found {len(matches)} source documents:\n"]
    for m in matches:
        lines.append(f"### {m['title']}")
        lines.append(f"Path: {m['path']}")
        lines.append(f"{m['preview'][:200]}")
        lines.append("")
    return "\n".join(lines)[:4000]


# ─── Tool dispatcher ─────────────────────────────────────────────────


def execute_tool(tool_name: str, args: dict, kb_dir: str) -> str:
    """Dispatch a tool call by name.

    Returns the tool's string result, or an "Unknown tool" message for an
    unrecognized name. Missing arguments fall back to empty/default values
    rather than raising, so a malformed model call degrades gracefully.
    """
    # Dispatch table: one lambda per tool, deferring evaluation until chosen.
    dispatch = {
        "find_by_source": lambda: find_by_source(args.get("query", ""), kb_dir),
        "read_source": lambda: read_source(args.get("source_title", ""), kb_dir),
        "read_entity": lambda: read_entity(args.get("name", ""), kb_dir),
        "list_entity_links": lambda: list_entity_links(args.get("name", ""), kb_dir),
        "read_claim": lambda: read_claim(args.get("title", ""), kb_dir),
        "search_kb": lambda: search_kb(args.get("query", ""), kb_dir, args.get("max_results", 5)),
        "explore_graph": lambda: explore_graph(args.get("claim_title", ""), kb_dir),
        "search_sources": lambda: search_sources(args.get("query", ""), kb_dir, args.get("max_results", 5)),
        "pr_status": lambda: _tool_pr_status(args.get("pr_number", 0)),
        "check_duplicate": lambda: _tool_check_duplicate(args.get("text", "")),
    }
    handler = dispatch.get(tool_name)
    if handler is None:
        return f"Unknown tool: {tool_name}"
    return handler()


# ─── Helpers ─────────────────────────────────────────────────────────


def _parse_frontmatter(path: Path) -> tuple[dict | None, str]:
    """Parse YAML frontmatter and body from a markdown file.

    Returns (frontmatter_dict, body). Frontmatter is None when the file has
    no leading `---` block, the block is unterminated, the YAML is invalid,
    or it parses to a non-dict value; in those cases the full text is
    returned as the body.
    """
    try:
        text = path.read_text(errors="replace")
    except Exception:
        return None, ""
    if not text.startswith("---"):
        return None, text
    end = text.find("\n---", 3)
    if end == -1:
        return None, text
    try:
        fm = yaml.safe_load(text[3:end])
        if not isinstance(fm, dict):
            return None, text
        body = text[end + 4:].strip()
        return fm, body
    except yaml.YAMLError:
        return None, text


def _find_file(name: str, search_dirs: list[Path]) -> Path | None:
    """Find a markdown file by name/slug across search directories.

    Match order per file: exact slug, hyphen/space-normalized equality,
    then substring (only for slugs of 8+ chars, to avoid spurious hits).
    Returns the first match, or None.
    """
    slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
    name_lower = name.lower()
    for search_dir in search_dirs:
        if not search_dir.exists():
            continue
        for md_file in search_dir.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue
            stem_lower = md_file.stem.lower()
            # Exact slug match
            if stem_lower == slug:
                return md_file
            # Normalized match (spaces vs hyphens)
            if stem_lower.replace("-", " ") == name_lower.replace("-", " "):
                return md_file
            # Substring match for long titles
            if len(slug) >= 8 and slug in stem_lower:
                return md_file
    return None


# ─── Pipeline DB tools ──────────────────────────────────────────────


def _tool_pr_status(pr_number: int) -> str:
    """Wrapper for pr_status() — connects to pipeline DB, returns formatted string."""
    import json
    import sqlite3
    from contextlib import closing

    db_path = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
    try:
        # closing() guarantees the connection is released even if the
        # query raises (the old code leaked it on error).
        with closing(sqlite3.connect(db_path)) as conn:
            conn.row_factory = sqlite3.Row
            row = conn.execute(
                """SELECT number, branch, source_path, status, domain, agent, commit_type, tier, leo_verdict, domain_verdict, domain_agent, eval_issues, priority, origin, cost_usd, created_at, merged_at, last_attempt, last_error, transient_retries, substantive_retries, description FROM prs WHERE number = ?""",
                (pr_number,),
            ).fetchone()
        if not row:
            return f"PR #{pr_number} not found."
        issues = []
        try:
            issues = json.loads(row["eval_issues"] or "[]")
        except (json.JSONDecodeError, TypeError):
            pass
        lines = [
            f"PR #{row['number']} — {row['status'].upper()}",
            f"Branch: {row['branch']}",
            f"Domain: {row['domain'] or 'unknown'} | Agent: {row['agent'] or 'pipeline'}",
            f"Type: {row['commit_type'] or 'unknown'} | Tier: {row['tier'] or 'unknown'}",
            f"Leo verdict: {row['leo_verdict']} | Domain verdict: {row['domain_verdict']}",
        ]
        if row["description"]:
            lines.append(f"Description: {row['description']}")
        if issues:
            lines.append(f"Eval issues: {', '.join(str(i) for i in issues)}")
        if row["last_error"]:
            lines.append(f"Last error: {row['last_error'][:200]}")
        lines.append(f"Retries: {row['transient_retries']} transient, {row['substantive_retries']} substantive")
        lines.append(f"Created: {row['created_at']} | Last attempt: {row['last_attempt']}")
        if row["merged_at"]:
            lines.append(f"Merged: {row['merged_at']}")
        if row["cost_usd"]:
            lines.append(f"Eval cost: ${row['cost_usd']:.4f}")
        return "\n".join(lines)
    except Exception as e:
        return f"Error querying PR #{pr_number}: {e}"


def _tool_check_duplicate(text: str) -> str:
    """Wrapper for check_duplicate() — calls Qdrant, returns formatted string."""
    import sys

    # Guard against growing sys.path on every call.
    lib_dir = os.path.join(os.path.dirname(__file__), "..")
    if lib_dir not in sys.path:
        sys.path.insert(0, lib_dir)
    from lib.search import check_duplicate as _check_dup

    if not text:
        return "Error: text is required."
    result = _check_dup(text)
    if result.get("error"):
        return f"Error: {result['error']}"
    lines = [f"Verdict: {result['verdict'].upper()} (highest score: {result['highest_score']:.4f})"]
    for i, m in enumerate(result["matches"], 1):
        lines.append(
            f"  {i}. [{m['score']:.4f}] {m['claim_title'][:80]}"
            f"\n     Path: {m['claim_path']}"
        )
    if not result["matches"]:
        lines.append("  No matches found above minimum threshold.")
    return "\n".join(lines)