teleo-codex/ops/pipeline-v2/telegram/kb_tools.py
m3taversal 7bfce6b706 commit telegram bot module from VPS — 20 files never previously in repo
Pulled from /opt/teleo-eval/telegram/ on VPS. Includes:
- bot.py (92K), kb_retrieval.py, kb_tools.py (agentic retrieval)
- retrieval.py (RRF merge, query decomposition, entity traversal)
- response.py (system prompt builder, response parser)
- agent_config.py, agent_runner.py (multi-agent template unit support)
- approval_stages.py, approvals.py, digest.py (approval workflow)
- eval_checks.py, eval.py (response quality checks)
- output_gate.py, x_publisher.py, x_client.py, x_search.py (X pipeline)
- market_data.py, worktree_lock.py (utilities)
- rio.yaml, theseus.yaml (agent configs)

These files were deployed to VPS but never committed to the repo.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 11:02:32 +02:00

719 lines
26 KiB
Python

#!/usr/bin/env python3
"""KB tools for LLM function-calling — source tracing + entity/claim lookup.
These tools let the agent trace claims back to their original sources,
find all claims from a specific piece of research, and read source documents.
Epimetheus owns this module.
"""
import logging
import os
import re
from pathlib import Path
import yaml
logger = logging.getLogger("tg.kb_tools")
# ─── Tool definitions (OpenAI function-calling format) ───────────────
# Tool schemas advertised to the LLM. Each entry follows the OpenAI
# function-calling format: {"type": "function", "function": {...}}, and each
# "name" maps 1:1 onto a handler dispatched by execute_tool() below.
TOOL_DEFINITIONS: list[dict] = [
    # find_by_source — trace extracted claims back to a source document.
    {
        "type": "function",
        "function": {
            "name": "find_by_source",
            "description": (
                "Find all claims extracted from a specific source (article, paper, thread). "
                "Search by author name, source title, or keywords. Returns all claims from "
                "matching sources with their frontmatter."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Author name, source title, or keywords to match against claim source fields",
                    },
                },
                "required": ["query"],
            },
        },
    },
    # read_source — fetch the original document behind a claim.
    {
        "type": "function",
        "function": {
            "name": "read_source",
            "description": (
                "Read the original source document (article, thread, paper) that claims were "
                "extracted from. Use when you need the full context behind a claim, not just "
                "the extracted summary."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "source_title": {
                        "type": "string",
                        "description": "Title or slug of the source document to read",
                    },
                },
                "required": ["source_title"],
            },
        },
    },
    # read_entity — full profile of one KB entity.
    {
        "type": "function",
        "function": {
            "name": "read_entity",
            "description": "Read the full profile of a KB entity (project, person, protocol).",
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "Entity name or slug",
                    },
                },
                "required": ["name"],
            },
        },
    },
    # list_entity_links — enumerate the [[wiki-links]] in an entity file.
    {
        "type": "function",
        "function": {
            "name": "list_entity_links",
            "description": "List all entities and claims linked from an entity's wiki-links.",
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "Entity name or slug",
                    },
                },
                "required": ["name"],
            },
        },
    },
    # read_claim — full content of a single claim file.
    {
        "type": "function",
        "function": {
            "name": "read_claim",
            "description": "Read the full content of a specific claim file.",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {
                        "type": "string",
                        "description": "Claim title or slug",
                    },
                },
                "required": ["title"],
            },
        },
    },
    # search_kb — keyword search over claims (delegates to kb_retrieval).
    {
        "type": "function",
        "function": {
            "name": "search_kb",
            "description": "Search the KB for claims matching a query. Uses keyword matching.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query",
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Max results to return (default 5)",
                    },
                },
                "required": ["query"],
            },
        },
    },
    # explore_graph — 1-hop traversal of a claim's knowledge-graph edges.
    {
        "type": "function",
        "function": {
            "name": "explore_graph",
            "description": (
                "Follow knowledge graph edges from a claim to find connected claims. "
                "Returns all claims linked via supports, challenges, depends_on, and related edges. "
                "Use this to discover the full argument structure around a claim — what supports it, "
                "what challenges it, and what it depends on."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "claim_title": {
                        "type": "string",
                        "description": "Title or slug of the claim to explore edges from",
                    },
                },
                "required": ["claim_title"],
            },
        },
    },
    # search_sources — search the raw source archive (not extracted claims).
    {
        "type": "function",
        "function": {
            "name": "search_sources",
            "description": (
                "Search the source archive for original documents by topic, author, or title. "
                "Returns matching source files with their titles and first few lines. "
                "Use this when you want to find the original research/article/thread, not just extracted claims."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Topic, author name, or keywords to search source documents",
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Max results to return (default 5)",
                    },
                },
                "required": ["query"],
            },
        },
    },
    # pr_status — pipeline DB lookup for a PR's eval/merge state.
    {
        "type": "function",
        "function": {
            "name": "pr_status",
            "description": (
                "Check the status of a pipeline PR by number. Returns eval verdicts, "
                "merge status, time in queue, rejection reasons, and retry counts."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "pr_number": {
                        "type": "integer",
                        "description": "PR number to look up",
                    },
                },
                "required": ["pr_number"],
            },
        },
    },
    # check_duplicate — vector-similarity dedup check against the KB.
    {
        "type": "function",
        "function": {
            "name": "check_duplicate",
            "description": (
                "Check if a claim is a near-duplicate of existing KB content. "
                "Returns top-3 closest matches with similarity scores. "
                ">=0.85 = likely duplicate, 0.70-0.85 = check manually, <0.70 = novel."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The claim text to check for duplicates",
                    },
                },
                "required": ["text"],
            },
        },
    },
]
# ─── Tool implementations ────────────────────────────────────────────
def find_by_source(query: str, kb_dir: str) -> str:
    """Find all claims extracted from sources matching the query.

    Searches claim frontmatter `source:` fields for author names, titles,
    or keywords, then returns the matching claims grouped by source.

    Args:
        query: Author name, source title, or keywords.
        kb_dir: Root directory of the knowledge base.

    Returns:
        Human-readable summary, capped at 4000 chars, 5 sources, and
        10 claims per source for prompt safety.
    """
    query_lower = query.lower()
    # Tokens shorter than 3 chars are noise ("a", "of", ...) — drop them.
    query_tokens = [t for t in re.findall(r'\w+', query_lower) if len(t) >= 3]
    kb_root = Path(kb_dir)
    claim_dirs = [kb_root / "domains", kb_root / "core", kb_root / "foundations"]
    # Scan all claim files for matching source fields.
    matches: list[dict] = []
    for claim_dir in claim_dirs:
        if not claim_dir.exists():
            continue
        for md_file in claim_dir.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue
            try:
                fm, _body = _parse_frontmatter(md_file)
                if not fm:
                    continue
                # BUG FIX: coerce to str — YAML can yield None/dict/list for
                # these fields, and a non-hashable value previously crashed
                # the grouping step below (dict used as a dict key).
                source = str(fm.get("source") or "")
                source_file = str(fm.get("source_file") or "")
                searchable = f"{source} {source_file}".lower()
                # Score: how many query tokens appear in the source fields.
                score = sum(1 for t in query_tokens if t in searchable)
                # Require at least half the tokens (minimum one) to match.
                if score >= max(1, len(query_tokens) // 2):
                    matches.append({
                        "title": md_file.stem.replace("-", " "),
                        "path": str(md_file.relative_to(kb_dir)),
                        "source": source,
                        "source_file": source_file,
                        "domain": fm.get("domain", "unknown"),
                        "confidence": fm.get("confidence", "unknown"),
                        "description": fm.get("description", ""),
                        "score": score,
                    })
            except Exception:
                # Best-effort scan: skip unreadable/malformed files.
                continue
    if not matches:
        return f"No claims found from sources matching '{query}'."
    # Sort by score desc, then group by source so the output reads as
    # "source -> its claims".
    matches.sort(key=lambda m: m["score"], reverse=True)
    by_source: dict[str, list[dict]] = {}
    for m in matches:
        by_source.setdefault(m["source"] or "unknown", []).append(m)
    lines = [f"Found {len(matches)} claims from {len(by_source)} matching sources:\n"]
    for source_name, claims in list(by_source.items())[:5]:  # Cap at 5 sources
        lines.append(f"## Source: {source_name}")
        if claims[0].get("source_file"):
            lines.append(f"File: {claims[0]['source_file']}")
        for c in claims[:10]:  # Cap at 10 claims per source
            lines.append(f"- **{c['title']}** ({c['confidence']}, {c['domain']})")
            if c["description"]:
                lines.append(f" {c['description'][:200]}")
        lines.append("")
    return "\n".join(lines)[:4000]
def read_source(source_title: str, kb_dir: str) -> str:
    """Read the original source document from the archive.

    Scans inbox/archive/, sources/, and inbox/queue/ for the markdown file
    whose name best matches the requested title, then returns its content
    (truncated to 4K for prompt safety).
    """
    wanted = source_title.lower()
    wanted_slug = re.sub(r'[^a-z0-9]+', '-', wanted).strip('-')
    candidate_dirs = (
        Path(kb_dir) / "inbox" / "archive",
        Path(kb_dir) / "sources",
        Path(kb_dir) / "inbox" / "queue",
    )
    top_file = None
    top_score = 0
    for directory in candidate_dirs:
        if not directory.exists():
            continue
        for candidate in directory.rglob("*.md"):
            stem = candidate.stem.lower()
            # Token-overlap score, with a big bonus for an exact slug hit.
            overlap = sum(
                1
                for tok in re.findall(r'\w+', wanted)
                if len(tok) >= 3 and tok in stem
            )
            if wanted_slug in stem:
                overlap += 5
            if overlap > top_score:
                top_score = overlap
                top_file = candidate
    if top_file is None:
        return f"Source document '{source_title}' not found in archive."
    try:
        text = top_file.read_text(errors="replace")
    except Exception as e:
        return f"Error reading source: {e}"
    # Truncate to 4K for prompt safety.
    if len(text) > 4000:
        text = text[:4000] + "\n\n[... truncated, full document is longer ...]"
    return f"## Source: {top_file.name}\n\n{text}"
def read_entity(name: str, kb_dir: str) -> str:
    """Read the full profile of a KB entity, truncated to 4000 chars."""
    root = Path(kb_dir)
    target = _find_file(name, [root / "entities", root / "decisions"])
    if target is None:
        return f"Entity '{name}' not found."
    try:
        return target.read_text(errors="replace")[:4000]
    except Exception as e:
        return f"Error reading entity: {e}"
def list_entity_links(name: str, kb_dir: str) -> str:
    """List all wiki-links from an entity file, deduplicated (case-insensitive)."""
    root = Path(kb_dir)
    entity_file = _find_file(name, [root / "entities", root / "decisions"])
    if entity_file is None:
        return f"Entity '{name}' not found."
    try:
        text = entity_file.read_text(errors="replace")
    except Exception as e:
        return f"Error reading entity links: {e}"
    # Dedup while preserving first-seen order; comparison is lowercased so
    # [[Foo]] and [[foo]] count as the same link.
    seen: set[str] = set()
    ordered: list[str] = []
    for target in re.findall(r"\[\[([^\]]+)\]\]", text):
        key = target.lower()
        if key not in seen:
            seen.add(key)
            ordered.append(target)
    if not ordered:
        return f"Entity '{name}' has no wiki-links."
    bullet_list = "\n".join(f"- [[{t}]]" for t in ordered)
    return f"Entity '{name}' links to {len(ordered)} items:\n" + bullet_list
def read_claim(title: str, kb_dir: str) -> str:
    """Read the full content of a claim file, truncated to 4000 chars."""
    root = Path(kb_dir)
    target = _find_file(title, [root / "domains", root / "core", root / "foundations"])
    if target is None:
        return f"Claim '{title}' not found."
    try:
        return target.read_text(errors="replace")[:4000]
    except Exception as e:
        return f"Error reading claim: {e}"
def search_kb(query: str, kb_dir: str, max_results: int = 5) -> str:
    """Search KB claims by keyword matching via the shared KB index."""
    from kb_retrieval import KBIndex, retrieve_context

    kb_index = KBIndex(kb_dir)
    kb_index.ensure_fresh()
    result = retrieve_context(query, kb_dir, index=kb_index, max_claims=max_results)
    if not result.claims:
        return f"No claims found for '{query}'."
    out = [f"Found {len(result.claims)} claims:"]
    for claim in result.claims:
        out.append(f"- **{claim.title}** ({claim.confidence}, {claim.domain}, score: {claim.score:.1f})")
        if claim.description:
            out.append(f" {claim.description[:200]}")
    return "\n".join(out)
def explore_graph(claim_title: str, kb_dir: str) -> str:
    """Follow knowledge graph edges from a claim to find connected claims.

    Prefers lib/search.py graph_expand() for the 1-hop traversal of
    supports/challenges/depends_on/related edges; if that module is not
    importable, falls back to reading the edge lists straight out of the
    claim's frontmatter.
    """
    root = Path(kb_dir)
    # Locate the claim file first — without it there is nothing to traverse.
    claim_file = _find_file(claim_title, [root / "domains", root / "core", root / "foundations"])
    if claim_file is None:
        return f"Claim '{claim_title}' not found. Try a different title or use search_kb to find it first."
    try:
        rel_path = str(claim_file.relative_to(kb_dir))
    except ValueError:
        rel_path = str(claim_file)
    try:
        # Preferred path: the shared graph traversal helper.
        from lib.search import graph_expand
        edges = graph_expand([rel_path], repo_root=Path(kb_dir), max_expanded=20)
    except ImportError:
        # Fallback: parse edge lists directly from the claim's frontmatter.
        edges = []
        fm, _ = _parse_frontmatter(claim_file)
        if fm:
            for kind in ("supports", "challenges", "challenged_by", "depends_on", "related"):
                raw = fm.get(kind, [])
                if isinstance(raw, str):
                    raw = [raw]
                if isinstance(raw, list):
                    edges.extend(
                        {"claim_title": t, "edge_type": kind, "edge_weight": 1.0}
                        for t in raw
                    )
    if not edges:
        return f"Claim '{claim_title}' has no graph edges (no supports, challenges, or related claims)."
    # Group edges by type so the output reads as one section per relation.
    grouped: dict[str, list[dict]] = {}
    for edge in edges:
        grouped.setdefault(edge["edge_type"], []).append(edge)
    labels = {
        "supports": "Supports (this claim backs these up)",
        "challenges": "Challenges (this claim argues against these)",
        "challenged_by": "Challenged by (these argue against this claim)",
        "depends_on": "Depends on (prerequisites for this claim)",
        "related": "Related (connected by topic)",
        "wiki_links": "Wiki-linked (mentioned in body text)",
    }
    out = [f"Graph edges from '{claim_title}' ({len(edges)} connected claims):\n"]
    for kind, members in grouped.items():
        out.append(f"### {labels.get(kind, kind)}")
        for member in members:
            name = member.get("claim_title", "unknown")
            weight = member.get("edge_weight", 1.0)
            suffix = f" (weight: {weight})" if weight != 1.0 else ""
            out.append(f"- {name}" + suffix)
        out.append("")
    return "\n".join(out)[:4000]
def search_sources(query: str, kb_dir: str, max_results: int = 5) -> str:
    """Search the source archive for original documents by topic/author/title.

    Scans inbox/archive/, sources/, and inbox/queue/, scoring each file by
    query-token overlap with its filename; files whose names do not match
    get a weaker (half-weight) score from their opening text.

    Args:
        query: Topic, author name, or keywords.
        kb_dir: Root directory of the knowledge base.
        max_results: Maximum number of source documents to return.

    Returns:
        Human-readable summary (max 4000 chars).
    """
    query_lower = query.lower()
    query_tokens = [t for t in re.findall(r'\w+', query_lower) if len(t) >= 3]
    if not query_tokens:
        return "Query too short — provide at least one keyword with 3+ characters."
    root = Path(kb_dir)
    search_dirs = [root / "inbox" / "archive", root / "sources", root / "inbox" / "queue"]
    # Hoisted loop invariant: minimum score required to count as a match.
    threshold = max(1, len(query_tokens) // 3)
    matches: list[dict] = []
    for search_dir in search_dirs:
        if not search_dir.exists():
            continue
        for md_file in search_dir.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue
            file_stem = md_file.stem.lower().replace("-", " ")
            # Score by token overlap with the filename.
            score = sum(1 for t in query_tokens if t in file_stem)
            # PERF FIX: read each file at most once (the original read every
            # matching file twice — once for head scoring, once for preview).
            text = None
            if score == 0:
                # Filename didn't match — fall back to scanning the opening
                # text for author/topic mentions, at half weight.
                try:
                    text = md_file.read_text(errors="replace")
                except Exception:
                    continue
                head = text[:500].lower()
                score = sum(0.5 for t in query_tokens if t in head)
            if score >= threshold:
                if text is None:
                    try:
                        text = md_file.read_text(errors="replace")
                    except Exception:
                        text = None
                preview = text[:300].strip() if text is not None else "(could not read)"
                matches.append({
                    "title": md_file.stem.replace("-", " "),
                    "path": str(md_file.relative_to(kb_dir)),
                    "score": score,
                    "preview": preview,
                })
    if not matches:
        return f"No source documents found matching '{query}'. Try different keywords or check find_by_source for claims from that source."
    matches.sort(key=lambda m: m["score"], reverse=True)
    matches = matches[:max_results]
    lines = [f"Found {len(matches)} source documents:\n"]
    for m in matches:
        lines.append(f"### {m['title']}")
        lines.append(f"Path: {m['path']}")
        lines.append(f"{m['preview'][:200]}")
        lines.append("")
    return "\n".join(lines)[:4000]
# ─── Tool dispatcher ─────────────────────────────────────────────────
def execute_tool(tool_name: str, args: dict, kb_dir: str) -> str:
    """Dispatch a tool call by name. Returns the tool's string result.

    Unknown tool names produce an error string rather than raising, so the
    agent loop can surface the problem to the model.
    """
    # Dispatch table: tool name -> zero-arg thunk that extracts its own
    # arguments from `args` with the same defaults as the schema.
    dispatch = {
        "find_by_source": lambda: find_by_source(args.get("query", ""), kb_dir),
        "read_source": lambda: read_source(args.get("source_title", ""), kb_dir),
        "read_entity": lambda: read_entity(args.get("name", ""), kb_dir),
        "list_entity_links": lambda: list_entity_links(args.get("name", ""), kb_dir),
        "read_claim": lambda: read_claim(args.get("title", ""), kb_dir),
        "search_kb": lambda: search_kb(args.get("query", ""), kb_dir, args.get("max_results", 5)),
        "explore_graph": lambda: explore_graph(args.get("claim_title", ""), kb_dir),
        "search_sources": lambda: search_sources(args.get("query", ""), kb_dir, args.get("max_results", 5)),
        "pr_status": lambda: _tool_pr_status(args.get("pr_number", 0)),
        "check_duplicate": lambda: _tool_check_duplicate(args.get("text", "")),
    }
    handler = dispatch.get(tool_name)
    if handler is None:
        return f"Unknown tool: {tool_name}"
    return handler()
# ─── Helpers ─────────────────────────────────────────────────────────
def _parse_frontmatter(path: Path) -> tuple[dict | None, str]:
"""Parse YAML frontmatter and body from a markdown file."""
try:
text = path.read_text(errors="replace")
except Exception:
return None, ""
if not text.startswith("---"):
return None, text
end = text.find("\n---", 3)
if end == -1:
return None, text
try:
fm = yaml.safe_load(text[3:end])
if not isinstance(fm, dict):
return None, text
body = text[end + 4:].strip()
return fm, body
except yaml.YAMLError:
return None, text
def _find_file(name: str, search_dirs: list[Path]) -> Path | None:
"""Find a markdown file by name/slug across search directories."""
slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
name_lower = name.lower()
for search_dir in search_dirs:
if not search_dir.exists():
continue
for md_file in search_dir.rglob("*.md"):
if md_file.name.startswith("_"):
continue
stem_lower = md_file.stem.lower()
# Exact slug match
if stem_lower == slug:
return md_file
# Normalized match (spaces vs hyphens)
if stem_lower.replace("-", " ") == name_lower.replace("-", " "):
return md_file
# Substring match for long titles
if len(slug) >= 8 and slug in stem_lower:
return md_file
return None
# ─── Pipeline DB tools ──────────────────────────────────────────────
def _tool_pr_status(pr_number: int) -> str:
    """Look up a pipeline PR in the SQLite pipeline DB and format its status.

    Reads the DB path from $PIPELINE_DB (defaults to the VPS deployment
    path). Returns a multi-line human-readable summary, a "not found"
    message, or an error string — this tool never raises.

    Args:
        pr_number: PR number to look up.
    """
    import json
    import sqlite3

    db_path = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
    try:
        conn = sqlite3.connect(db_path)
        try:
            conn.row_factory = sqlite3.Row
            row = conn.execute(
                """SELECT number, branch, source_path, status, domain, agent,
                          commit_type, tier, leo_verdict, domain_verdict,
                          domain_agent, eval_issues, priority, origin,
                          cost_usd, created_at, merged_at, last_attempt, last_error,
                          transient_retries, substantive_retries, description
                   FROM prs WHERE number = ?""",
                (pr_number,),
            ).fetchone()
        finally:
            # FIX: close even when the query raises (the connection was
            # previously leaked on the error path).
            conn.close()
        if not row:
            return f"PR #{pr_number} not found."
        issues = []
        try:
            issues = json.loads(row["eval_issues"] or "[]")
        except (json.JSONDecodeError, TypeError):
            pass
        lines = [
            # FIX: the header previously concatenated number and status with
            # no separator ("PR #42MERGED").
            f"PR #{row['number']} — {row['status'].upper()}",
            f"Branch: {row['branch']}",
            f"Domain: {row['domain'] or 'unknown'} | Agent: {row['agent'] or 'pipeline'}",
            f"Type: {row['commit_type'] or 'unknown'} | Tier: {row['tier'] or 'unknown'}",
            f"Leo verdict: {row['leo_verdict']} | Domain verdict: {row['domain_verdict']}",
        ]
        if row["description"]:
            lines.append(f"Description: {row['description']}")
        if issues:
            lines.append(f"Eval issues: {', '.join(str(i) for i in issues)}")
        if row["last_error"]:
            lines.append(f"Last error: {row['last_error'][:200]}")
        lines.append(f"Retries: {row['transient_retries']} transient, {row['substantive_retries']} substantive")
        lines.append(f"Created: {row['created_at']} | Last attempt: {row['last_attempt']}")
        if row["merged_at"]:
            lines.append(f"Merged: {row['merged_at']}")
        if row["cost_usd"]:
            lines.append(f"Eval cost: ${row['cost_usd']:.4f}")
        return "\n".join(lines)
    except Exception as e:
        return f"Error querying PR #{pr_number}: {e}"
def _tool_check_duplicate(text: str) -> str:
    """Check a claim text for near-duplicates via lib.search.check_duplicate.

    Formats the verdict (duplicate/check/novel threshold scoring happens in
    lib.search) plus the top matches with their scores and paths.

    Args:
        text: The claim text to check. Required.
    """
    # FIX: validate input BEFORE mutating sys.path / importing, so an empty
    # call has no side effects.
    if not text:
        return "Error: text is required."
    import sys
    parent_dir = os.path.join(os.path.dirname(__file__), "..")
    # FIX: guard the insert — the original prepended unconditionally on
    # every call, growing sys.path without bound.
    if parent_dir not in sys.path:
        sys.path.insert(0, parent_dir)
    from lib.search import check_duplicate as _check_dup
    result = _check_dup(text)
    if result.get("error"):
        return f"Error: {result['error']}"
    lines = [f"Verdict: {result['verdict'].upper()} (highest score: {result['highest_score']:.4f})"]
    for i, m in enumerate(result["matches"], 1):
        lines.append(
            f" {i}. [{m['score']:.4f}] {m['claim_title'][:80]}"
            f"\n Path: {m['claim_path']}"
        )
    if not result["matches"]:
        lines.append(" No matches found above minimum threshold.")
    return "\n".join(lines)