teleo-codex/ops/pipeline-v2/lib/search.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00

480 lines
17 KiB
Python

"""Shared Qdrant vector search library for the Teleo knowledge base.
Provides embed + search + graph expansion as a reusable library.
Any consumer (Argus dashboard, Telegram bot, agent research) imports from here.
Layer 1: Qdrant vector search (semantic similarity)
Layer 2: Graph expansion (1-hop via frontmatter edges)
Layer 3: Left to the caller (agent context, domain filtering)
Owner: Epimetheus
"""
import json
import logging
import os
import re
from pathlib import Path
import urllib.request
from . import config

logger = logging.getLogger("pipeline.search")

# --- Config (all from environment or config.py defaults) ---
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "teleo-claims")
# OpenAI embedding model requested via the OpenRouter proxy; embed_query()
# documents the expected 1536-dim output.
EMBEDDING_MODEL = "text-embedding-3-small"
# Process-wide cache for the OpenRouter API key, populated lazily by
# _get_api_key().
_OPENROUTER_KEY: str | None = None
# Matches Obsidian-style [[wiki links]]; group 1 is the link target.
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
# Structural files that should never be included in graph expansion results.
# These are indexes/MOCs, not claims — expanding them pulls entire domains.
STRUCTURAL_FILES = {"_map.md", "_overview.md"}
def _get_api_key() -> str | None:
    """Load the OpenRouter API key, caching it for the process lifetime.

    Resolution order:
      1. Previously cached value.
      2. The secrets file at config.SECRETS_DIR / "openrouter-key".
      3. The OPENROUTER_API_KEY environment variable.

    Returns:
        The key string, or None when no key can be found.
    """
    global _OPENROUTER_KEY
    if _OPENROUTER_KEY:
        return _OPENROUTER_KEY
    key_file = config.SECRETS_DIR / "openrouter-key"
    try:
        if key_file.exists():
            key = key_file.read_text().strip()
            # Fix: an empty secrets file previously returned "" and masked
            # the environment-variable fallback.
            if key:
                _OPENROUTER_KEY = key
                return _OPENROUTER_KEY
    except OSError as e:
        # e.g. permission error — fall through to the env var instead of raising
        logger.warning("Could not read key file %s: %s", key_file, e)
    _OPENROUTER_KEY = os.environ.get("OPENROUTER_API_KEY")
    return _OPENROUTER_KEY
# --- Layer 1: Vector search ---
def embed_query(text: str) -> list[float] | None:
    """Embed a query string via OpenRouter (OpenAI-compatible endpoint).

    The input is truncated to 8000 characters before embedding.

    Returns:
        A 1536-dim embedding vector, or None on any failure (missing key,
        network error, malformed response).
    """
    api_key = _get_api_key()
    if not api_key:
        logger.error("No OpenRouter API key available for embedding")
        return None

    request = urllib.request.Request(
        "https://openrouter.ai/api/v1/embeddings",
        data=json.dumps(
            {"model": f"openai/{EMBEDDING_MODEL}", "input": text[:8000]}
        ).encode(),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as resp:
            payload = json.loads(resp.read())
            return payload["data"][0]["embedding"]
    except Exception as e:
        # Covers HTTP errors, timeouts, JSON decode errors and an
        # unexpectedly shaped response body alike.
        logger.error("Embedding failed: %s", e)
        return None
def search_qdrant(vector: list[float], limit: int = 10,
                  domain: str | None = None, confidence: str | None = None,
                  exclude: list[str] | None = None,
                  score_threshold: float = 0.3,
                  offset: int = 0) -> list[dict]:
    """Query the Qdrant collection for the nearest claims to *vector*.

    Args:
        vector: Query embedding.
        limit: Maximum number of hits to return.
        domain: Optional payload filter on the "domain" field.
        confidence: Optional payload filter on the "confidence" field.
        exclude: Claim paths to exclude from the results.
        score_threshold: Minimum similarity score for a hit.
        offset: Skip first N results (Qdrant native offset for pagination).

    Returns:
        List of hits: [{id, score, payload: {claim_path, claim_title, ...}}];
        empty list on any error.
    """
    body: dict = {
        "vector": vector,
        "limit": limit,
        "with_payload": True,
        "score_threshold": score_threshold,
    }
    if offset > 0:
        body["offset"] = offset

    must = []
    if domain:
        must.append({"key": "domain", "match": {"value": domain}})
    if confidence:
        must.append({"key": "confidence", "match": {"value": confidence}})
    must_not = [
        {"key": "claim_path", "match": {"value": path}}
        for path in (exclude or [])
    ]
    qdrant_filter: dict = {}
    if must:
        qdrant_filter["must"] = must
    if must_not:
        qdrant_filter["must_not"] = must_not
    if qdrant_filter:
        body["filter"] = qdrant_filter

    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            return json.loads(resp.read()).get("result", [])
    except Exception as e:
        logger.error("Qdrant search failed: %s", e)
        return []
# --- Layer 2: Graph expansion ---
def _parse_frontmatter_edges(path: Path) -> dict:
    """Extract relationship edges from a claim's YAML frontmatter.

    Handles inline lists (`depends_on: ["a", "b"]`), multi-line lists
    (`- a` items), and single scalar values.

    Returns:
        {supports: [...], challenges: [...], depends_on: [...],
         related: [...], wiki_links: [...]}.
        Wiki links found in the body are kept separate from explicit
        frontmatter edges so callers can weight them differently. All lists
        are empty when the file is unreadable or has no frontmatter.
    """
    edges: dict = {"supports": [], "challenges": [], "depends_on": [],
                   "related": [], "wiki_links": []}
    try:
        text = path.read_text(errors="replace")
    except Exception:
        return edges
    if not text.startswith("---"):
        return edges
    end = text.find("\n---", 3)
    if end == -1:
        return edges

    # Use a YAML parser for reliable edge extraction; any parse failure
    # simply leaves the explicit-edge lists empty.
    try:
        import yaml
        parsed = yaml.safe_load(text[3:end])
        if isinstance(parsed, dict):
            for field in ("supports", "challenges", "depends_on", "related"):
                value = parsed.get(field)
                if isinstance(value, list):
                    edges[field] = [str(item).strip() for item in value if item]
                elif isinstance(value, str) and value.strip():
                    edges[field] = [value.strip()]
    except Exception:
        pass

    # Wiki links from the body become a separate (lower-weight) edge type,
    # excluding anything already present as an explicit edge.
    explicit = {
        name
        for field in ("supports", "challenges", "depends_on", "related")
        for name in edges[field]
    }
    for raw in WIKI_LINK_RE.findall(text[end + 4:]):
        link = raw.strip()
        if link and link not in explicit and link not in edges["wiki_links"]:
            edges["wiki_links"].append(link)
    return edges
def _resolve_claim_path(name: str, repo_root: Path) -> Path | None:
"""Resolve a claim name (from frontmatter edge or wiki link) to a file path.
Handles both naming conventions:
- "GLP-1 receptor agonists are...""GLP-1 receptor agonists are....md" (spaces)
- "glp-1-persistence-drops...""glp-1-persistence-drops....md" (slugified)
Checks domains/, core/, foundations/, decisions/ subdirectories.
"""
# Try exact name first (spaces in filename), then slugified
candidates = [name]
slug = name.lower().replace(" ", "-").replace("_", "-")
if slug != name:
candidates.append(slug)
for subdir in ["domains", "core", "foundations", "decisions"]:
base = repo_root / subdir
if not base.is_dir():
continue
for candidate_name in candidates:
for md in base.rglob(f"{candidate_name}.md"):
return md
return None
def graph_expand(seed_paths: list[str], repo_root: Path | None = None,
                 max_expanded: int = 30,
                 challenge_weight: float = 1.5,
                 seen: set[str] | None = None) -> list[dict]:
    """Layer 2: Expand seed claims 1-hop through knowledge graph edges.

    Traverses supports/challenges/depends_on/related/wiki_links edges found
    in each seed's frontmatter. Edge weights: challenges = challenge_weight
    (default 1.5), depends_on 1.25, supports/related 1.0, wiki_links 0.5.
    Results are sorted by weight descending so the cap cuts low-value edges
    first.

    Args:
        seed_paths: Repo-relative claim paths to expand from.
        repo_root: Knowledge base root (defaults to config.MAIN_WORKTREE).
        max_expanded: Hard cap on the number of expansions returned.
        challenge_weight: Weight applied to "challenges" edges.
        seen: Optional set of paths already matched (e.g. from keyword
            search) to exclude.

    Returns:
        List of {claim_path, claim_title, edge_type, edge_weight, from_claim},
        excluding claims already in seed_paths or seen.
    """
    # Fix: challenge_weight was previously dead — the weight table
    # hard-coded 1.5 regardless of the argument.
    edge_weights = {
        "challenges": challenge_weight,
        "challenged_by": challenge_weight,
        "depends_on": 1.25,
        "supports": 1.0,
        "related": 1.0,
        "wiki_links": 0.5,
    }
    root = repo_root or config.MAIN_WORKTREE
    all_expanded = []
    visited = set(seed_paths)
    if seen:
        visited.update(seen)
    for seed_path in seed_paths:
        full_path = root / seed_path
        if not full_path.exists():
            continue
        edges = _parse_frontmatter_edges(full_path)
        for edge_type, targets in edges.items():
            weight = edge_weights.get(edge_type, 1.0)
            for target_name in targets:
                target_path = _resolve_claim_path(target_name, root)
                if target_path is None:
                    continue
                rel_path = str(target_path.relative_to(root))
                if rel_path in visited:
                    continue
                # Skip structural files (MOCs/indexes) — they pull entire domains
                if target_path.name in STRUCTURAL_FILES:
                    continue
                visited.add(rel_path)
                all_expanded.append({
                    "claim_path": rel_path,
                    "claim_title": str(_claim_title(target_path, target_name)),
                    "edge_type": edge_type,
                    "edge_weight": weight,
                    "from_claim": seed_path,
                })
    # Sort by weight descending so the cap cuts lowest-value edges first
    all_expanded.sort(key=lambda x: x["edge_weight"], reverse=True)
    return all_expanded[:max_expanded]


def _claim_title(path: Path, fallback: str):
    """Read a claim's display title from frontmatter ("name", then "title").

    Returns *fallback* (the raw edge target name) when the file is
    unreadable or has no parseable frontmatter.
    """
    try:
        text = path.read_text(errors="replace")
        if text.startswith("---"):
            end = text.find("\n---", 3)
            if end > 0:
                import yaml
                fm = yaml.safe_load(text[3:end])
                if isinstance(fm, dict):
                    return fm.get("name", fm.get("title", fallback))
    except Exception:
        pass
    return fallback
# --- Combined search (Layer 1 + Layer 2) ---
# Default thresholds — lowered Apr 5 after production audit showed 0 vector hits.
# text-embedding-3-small scores 0.50-0.60 on conceptual matches (e.g. "risks in
# investing" vs specific claims). 0.70 rejected every result. 0.50/0.40 lets
# relevant claims through while still filtering noise.
PASS1_LIMIT = 5          # max Qdrant hits in pass 1
PASS1_THRESHOLD = 0.50   # min similarity for pass-1 hits
PASS2_LIMIT = 5          # max additional Qdrant hits in pass 2 (expand=True)
PASS2_THRESHOLD = 0.40   # lower similarity bar for pass-2 hits
HARD_CAP = 10            # absolute cap on combined direct + expanded results
def _dedup_hits(hits: list[dict], seen: set[str]) -> list[dict]:
    """Filter Qdrant hits: dedup by claim_path, exclude structural files.

    Mutates *seen* in place by adding each accepted claim_path, so repeated
    calls (pass 1 then pass 2 of search()) deduplicate across passes.

    Returns:
        Simplified result dicts with the score rounded to 4 decimals and
        the snippet truncated to 200 chars.
    """
    results = []
    for hit in hits:
        payload = hit.get("payload", {})
        claim_path = payload.get("claim_path", "")
        if claim_path in seen:
            continue
        if claim_path.split("/")[-1] in STRUCTURAL_FILES:
            continue
        seen.add(claim_path)
        results.append({
            "claim_title": payload.get("claim_title", ""),
            "claim_path": claim_path,
            "score": round(hit.get("score", 0), 4),
            "domain": payload.get("domain", ""),
            "confidence": payload.get("confidence", ""),
            # Fix: `or ""` guards against an explicit null snippet in the
            # payload, which previously raised TypeError on the slice.
            "snippet": (payload.get("snippet") or "")[:200],
            "type": payload.get("type", "claim"),
        })
    return results
def _sort_results(direct: list[dict], expanded: list[dict]) -> list[dict]:
    """Order combined results: similarity desc, then challenges, then the rest.

    The ordering is load-bearing: LLM consumers have primacy bias, so the
    strongest claims must come first and counterpoints before other
    expansions.
    """
    # Direct hits ranked by similarity score, best first.
    by_score = sorted(direct, key=lambda r: r.get("score", 0), reverse=True)
    # Expansion hits: challenges edges (counterpoints) lead, each group
    # ranked by edge weight.
    counterpoints = sorted(
        (r for r in expanded if r.get("edge_type") == "challenges"),
        key=lambda r: r.get("edge_weight", 0),
        reverse=True,
    )
    remainder = sorted(
        (r for r in expanded if r.get("edge_type") != "challenges"),
        key=lambda r: r.get("edge_weight", 0),
        reverse=True,
    )
    return by_score + counterpoints + remainder
def search(query: str, expand: bool = False,
           domain: str | None = None, confidence: str | None = None,
           exclude: list[str] | None = None) -> dict:
    """Two-pass semantic search: embed query, search Qdrant, optionally expand.

    Pass 1 (expand=False, default): top PASS1_LIMIT (5) claims from Qdrant
        with score >= PASS1_THRESHOLD (0.50). Sufficient for ~80% of
        queries. Fast and focused.
    Pass 2 (expand=True): next PASS2_LIMIT (5) claims (offset past pass 1,
        score >= PASS2_THRESHOLD, 0.40) plus graph-expanded claims
        (challenges and related edges). Hard cap HARD_CAP (10) total.
        Agent calls this only when pass 1 didn't answer the question.

    Args:
        query: Natural-language search string.
        expand: Run pass 2 and graph expansion when True.
        domain: Optional Qdrant payload filter on "domain".
        confidence: Optional Qdrant payload filter on "confidence".
        exclude: Claim paths to omit from all results.

    Returns {
        "query": str,
        "direct_results": [...],   # Layer 1 Qdrant hits (sorted by score desc)
        "expanded_results": [...], # Layer 2 graph expansion (challenges first)
        "total": int,
    }
    On embedding failure returns empty results plus "error": "embedding_failed".
    """
    vector = embed_query(query)
    if vector is None:
        return {"query": query, "direct_results": [], "expanded_results": [],
                "total": 0, "error": "embedding_failed"}
    # --- Pass 1: Top 5, high threshold ---
    hits = search_qdrant(vector, limit=PASS1_LIMIT, domain=domain,
                         confidence=confidence, exclude=exclude,
                         score_threshold=PASS1_THRESHOLD)
    # seen_paths is shared across passes: _dedup_hits adds every accepted
    # path to it, so pass 2 and graph expansion cannot re-surface a claim.
    seen_paths: set[str] = set()
    if exclude:
        seen_paths.update(exclude)
    direct = _dedup_hits(hits, seen_paths)
    expanded = []
    if expand:
        # --- Pass 2: Next 5 from Qdrant (lower threshold, offset) ---
        pass2_hits = search_qdrant(vector, limit=PASS2_LIMIT, domain=domain,
                                   confidence=confidence, exclude=exclude,
                                   score_threshold=PASS2_THRESHOLD,
                                   offset=PASS1_LIMIT)
        pass2_direct = _dedup_hits(pass2_hits, seen_paths)
        direct.extend(pass2_direct)
        # Graph expansion on all direct results (pass 1 + pass 2 seeds)
        seed_paths = [r["claim_path"] for r in direct]
        remaining_cap = HARD_CAP - len(direct)
        if remaining_cap > 0:
            expanded = graph_expand(seed_paths, max_expanded=remaining_cap,
                                    seen=seen_paths)
    # Enforce hard cap across all results
    all_sorted = _sort_results(direct, expanded)[:HARD_CAP]
    # Split back into direct vs expanded for backward compat
    direct_paths = {r["claim_path"] for r in direct}
    final_direct = [r for r in all_sorted if r.get("claim_path") in direct_paths]
    final_expanded = [r for r in all_sorted if r.get("claim_path") not in direct_paths]
    return {
        "query": query,
        "direct_results": final_direct,
        "expanded_results": final_expanded,
        "total": len(all_sorted),
    }
# --- Duplicate detection ---
def check_duplicate(text: str, threshold: float = 0.85,
                    domain: str | None = None) -> dict:
    """Check if a claim/text is a near-duplicate of existing KB content.

    Embeds the text, searches Qdrant, and returns the top-3 matches with
    scores. Thresholds: >=0.85 likely duplicate, 0.70-0.85 check manually,
    <0.70 novel.

    Args:
        text: The claim text to check.
        threshold: Minimum score to flag as potential duplicate (default 0.85).
        domain: Optional domain filter.

    Returns:
        {
            "query": str,
            "is_duplicate": bool,    # True if any match >= threshold
            "highest_score": float,  # Best match score
            "verdict": str,          # "duplicate" | "check_manually" | "novel"
            "matches": [             # Top 3 matches
                {"score": float, "claim_path": str,
                 "claim_title": str, "domain": str}
            ]
        }
        (verdict is "error" and "error": "embedding_failed" is added when
        the text cannot be embedded.)
    """
    vector = embed_query(text)
    if vector is None:
        return {"query": text[:100], "is_duplicate": False, "highest_score": 0,
                "verdict": "error", "matches": [], "error": "embedding_failed"}

    matches = [
        {
            "score": round(hit.get("score", 0), 4),
            "claim_path": hit.get("payload", {}).get("claim_path", ""),
            "claim_title": hit.get("payload", {}).get("claim_title", ""),
            "domain": hit.get("payload", {}).get("domain", ""),
        }
        for hit in search_qdrant(vector, limit=3, domain=domain,
                                 score_threshold=0.3)
    ]
    # Qdrant returns hits sorted by score descending, so the first match
    # carries the best score.
    highest = matches[0]["score"] if matches else 0.0
    if highest >= threshold:
        verdict = "duplicate"
    elif highest >= 0.70:
        verdict = "check_manually"
    else:
        verdict = "novel"
    return {
        "query": text[:100],
        "is_duplicate": highest >= threshold,
        "highest_score": highest,
        "verdict": verdict,
        "matches": matches,
    }