teleo-codex/ops/pipeline-v2/lib/pre_screen.py

"""Pre-screening: identify themes from source, fetch prior art from Qdrant.

Runs before extraction to show the extractor what the KB already knows.
Reduces near-duplicates (our #1 rejection cause) by turning semantic
pre-screening from a manual discipline into a pipeline feature.

Design: Leo (approved 2026-03-30). Owner: Epimetheus.

Flow:
  1. Haiku identifies 3-5 themes from source text
  2. Each theme + title (with author-stripped variant) → Tier 1 search
  3. Results injected into extraction prompt as "Prior Art"
  4. Extractor classifies extractions as NEW / ENRICHMENT / CHALLENGE
  5. ENRICHMENT/CHALLENGE must cite specific target claim (hard gate)

Cost: ~$0.002/source (Haiku theme pass) + free Qdrant queries.
"""

import json
import os
import re
import sys

import requests

# Search library (same Tier 1 path used by Argus + Telegram bot)
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from lib.search import search

# OpenRouter chat-completions endpoint and the model used for the theme pass.
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
THEME_MODEL = "anthropic/claude-haiku-4.5"

# Prior art threshold — only results scoring at or above this value are shown
# to the extractor.
# 0.50 catches mechanism-level matches where compound themes dilute embeddings.
# Was 0.65 but Haiku compound themes score 0.50-0.60 even on exact matches.
# False positives cost nothing (extractor sees irrelevant prior art, ignores it).
# False negatives cost wasted extraction + review + rejection.
PRIOR_ART_THRESHOLD = 0.50

# Regex to strip leading author/entity patterns from titles
# e.g. "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go"
#      "Aschenbrenner — Situational Awareness" → "Situational Awareness"
# Matches one or two leading words (letters, hyphens, apostrophes) followed by
# a colon, en dash, em dash, or hyphen, plus surrounding whitespace.
# NOTE(review): a plain hyphen counts as a separator, so a hyphenated first
# word ("AI-powered video ...") is also treated as "prefix-" and stripped.
AUTHOR_PREFIX_RE = re.compile(
    r"^[A-Za-z\-']+(?:\s+[A-Za-z\-']+)?\s*[:–—\-]\s*", re.UNICODE
)


def identify_themes(source_content: str, api_key: str, source_title: str = "") -> list[str]:
    """Use Haiku to identify 3-5 major themes from source text.

    Sends the first 3000 characters of ``source_content`` to the theme model
    through OpenRouter and parses the reply as a JSON array of short search
    queries.

    Args:
        source_content: Raw source text; only the leading 3000 chars are sent.
        api_key: OpenRouter API key, sent as a Bearer token.
        source_title: Title used as the single fallback theme.

    Returns:
        Up to five non-empty, de-duplicated theme strings suitable as search
        queries. Falls back to ``[source_title]`` (or ``[]`` when no title is
        given) if the source is blank, the request fails, or the reply does
        not contain a usable list of strings.
    """
    fallback = [source_title] if source_title else []

    # Truncate source to keep Haiku costs minimal
    snippet = source_content[:3000]
    if not snippet.strip():
        # Nothing to analyse — don't pay for a model call on blank input.
        return fallback

    prompt = f"""Identify the 3-5 major themes or topics in this text.
Return ONLY a JSON array of short search queries (3-8 words each).
Keep queries SHORT — 3-5 words is ideal. Compound phrases score poorly in vector search.

Example good output: ["futarchy governance", "semaglutide kidney outcomes", "ICO oversubscription"]
Example bad output: ["futarchy governance mechanisms detecting revenue misrepresentation token launches", "prediction market accuracy identifying fraudulent financial claims"]

Text:
{snippet}

Return JSON array only, no explanation."""

    try:
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://livingip.xyz",
            "X-Title": "Teleo Pre-Screen",
        }
        payload = {
            "model": THEME_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1,
            "max_tokens": 500,
        }
        resp = requests.post(OPENROUTER_URL, headers=headers, json=payload, timeout=30)
        resp.raise_for_status()
        content = resp.json()["choices"][0]["message"]["content"]

        themes = _parse_theme_response(content)
        if themes:
            return themes
        # Valid JSON but nothing usable: say so instead of falling back silently.
        print("  WARN: Theme identification returned no usable themes", file=sys.stderr)
    except Exception as e:
        # Deliberately broad: pre-screening is best-effort and must never
        # abort extraction, whatever the HTTP/JSON failure was.
        print(f"  WARN: Theme identification failed: {e}", file=sys.stderr)

    # Fallback: use title as the only theme
    return fallback


def _parse_theme_response(content: str) -> list[str]:
    """Turn the model's reply into a cleaned list of at most five themes.

    Raises:
        ValueError: if the reply holds no JSON array of strings
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    if not isinstance(content, str):
        raise ValueError("model reply has no text content")

    text = content.strip()
    # Keep only the outermost [...] span so markdown fencing or stray prose
    # around the array does not break json.loads.
    start, end = text.find("["), text.rfind("]")
    if start == -1 or end <= start:
        raise ValueError("model reply contains no JSON array")

    parsed = json.loads(text[start:end + 1])
    if not isinstance(parsed, list) or not all(isinstance(t, str) for t in parsed):
        raise ValueError("model reply is not a JSON array of strings")

    # Drop blanks and case-insensitive duplicates so they neither become
    # empty search queries nor use up one of the five slots.
    cleaned: list[str] = []
    seen: set[str] = set()
    for raw in parsed:
        theme = raw.strip()
        key = theme.casefold()
        if theme and key not in seen:
            seen.add(key)
            cleaned.append(theme)
    return cleaned[:5]


def _strip_author(title: str) -> str:
    """Return *title* without its leading author/entity prefix.

    "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go"
    "Noah Smith — AI and Jobs" → "AI and Jobs"

    Returns an empty string when no useful variant exists: either nothing
    was removed, or what remains is 10 characters or fewer.
    """
    candidate = AUTHOR_PREFIX_RE.sub("", title).strip()
    # Reject variants that are unchanged or too short to be a meaningful query.
    if candidate == title or len(candidate) <= 10:
        return ""
    return candidate


def _extract_title_from_source(source_content: str, source_file: str) -> str:
    """Get a usable title from source frontmatter or filename.

    Args:
        source_content: Full source text; the first line of the form
            ``title: ...`` (optionally quoted) is used when present.
        source_file: Path of the source file, used when no title line exists.

    Returns:
        The title value with surrounding quotes/whitespace removed, or else
        the file's basename without a trailing ``.md`` and without a leading
        ``YYYY-MM-DD-`` date prefix, with hyphens turned into spaces.
    """
    # Try frontmatter title. The whitespace classes exclude "\n" on purpose:
    # with plain \s an empty "title:" field would swallow the line break and
    # capture the NEXT line (e.g. "author: Bob") as the title.
    match = re.search(
        r"^title:[^\S\n]*[\"']?(.+?)[\"']?[^\S\n]*$", source_content, re.MULTILINE
    )
    if match:
        return match.group(1).strip()

    # Fall back to filename; strip only a trailing ".md", not every occurrence.
    basename = os.path.basename(source_file).removesuffix(".md")
    # Strip date prefix (e.g., "2026-03-15-article-name" → "article-name")
    basename = re.sub(r"^\d{4}-\d{2}-\d{2}-", "", basename)
    return basename.replace("-", " ")


def pre_screen(source_content: str, source_file: str, api_key: str,
               domain: str | None = None) -> dict:
    """Run full pre-screening: themes → search → prior art.

    Args:
        source_content: Full text of the source being extracted.
        source_file: Path of the source file (title fallback).
        api_key: OpenRouter API key for the theme pass.
        domain: Currently unused — every search is issued with ``domain=None``
            (cross-domain on purpose). Kept for interface compatibility.

    Returns:
        {
            "themes": ["theme1", "theme2", ...],
            "prior_art": [
                {"claim_path": str, "title": str, "score": float, "query": str},
                ...
            ],
            "search_queries": ["query1", "query2", ...],  # for audit trail
        }

        ``prior_art`` holds one entry per claim path, carrying the best score
        seen across all queries and the query that produced it. Entries below
        ``PRIOR_ART_THRESHOLD`` are dropped; the rest are sorted by score
        descending and capped at 25.
    """
    title = _extract_title_from_source(source_content, source_file)

    # Step 1: Identify themes
    themes = identify_themes(source_content, api_key, source_title=title)

    # Step 2: Build search queries (themes + title + author-stripped title)
    queries = list(themes)
    if title and title not in queries:
        queries.append(title)
    stripped = _strip_author(title)
    if stripped and stripped not in queries:
        queries.append(stripped)

    # Step 3: Search Qdrant for each query (Tier 1: expand=False)
    # Keyed by claim path. A claim hit by several queries keeps its BEST
    # score; keeping the first-seen score could leave a strong match recorded
    # below the threshold and silently filtered out.
    best_by_path: dict[str, dict] = {}

    for query in queries:
        if not query.strip():
            # A blank query cannot match anything meaningful; skip the call.
            continue
        try:
            results = search(query, expand=False, domain=None)  # cross-domain on purpose
            for hit in results.get("direct_results", []):
                path = hit.get("claim_path", "")
                if not path:
                    continue
                score = round(hit.get("score", 0), 3)
                known = best_by_path.get(path)
                if known is not None and known["score"] >= score:
                    continue
                best_by_path[path] = {
                    "claim_path": path,
                    "title": hit.get("title", os.path.basename(path).replace(".md", "").replace("-", " ")),
                    "score": score,
                    "query": query,
                }
        except Exception as e:
            # Best-effort: one failing query must not abort pre-screening.
            print(f"  WARN: Pre-screen search failed for '{query[:50]}': {e}", file=sys.stderr)

    # Filter below threshold, sort by score descending, cap at 25
    prior_art = [p for p in best_by_path.values() if p["score"] >= PRIOR_ART_THRESHOLD]
    prior_art.sort(key=lambda x: x["score"], reverse=True)
    prior_art = prior_art[:25]

    return {
        "themes": themes,
        "prior_art": prior_art,
        "search_queries": queries,
    }


def format_prior_art_for_prompt(prior_art: list[dict]) -> str:
    """Render prior art as a bullet list for the extraction prompt.

    Leo's required format, one line per claim:
    - [claim-slug](path) — similarity: 0.82 — query: "theme that matched"

    The slug is the claim file's basename without ".md"; the query is cut to
    60 characters. An empty list yields a single "novel territory" sentence.
    """
    if not prior_art:
        return "No similar claims found in the KB. This source likely covers novel territory."

    def _bullet(entry: dict) -> str:
        path = entry["claim_path"]
        slug = os.path.basename(path).replace(".md", "")
        score = entry["score"]
        query = entry["query"][:60]
        return f'- [{slug}]({path}) — similarity: {score:.2f} — query: "{query}"'

    return "\n".join(_bullet(entry) for entry in prior_art)


def format_prior_art_for_pr(prior_art: list[dict]) -> str:
    """Render prior art as a Markdown section for the PR body.

    Each bullet shows the claim slug linked to its path, the similarity
    score, and the query that matched (cut to 80 characters) so a reviewer
    can verify the match. An empty list yields a one-line "no prior art"
    note. Both forms end with a newline.
    """
    if not prior_art:
        return "No prior art found — source covers novel territory.\n"

    bullets = []
    for entry in prior_art:
        path = entry["claim_path"]
        slug = os.path.basename(path).replace(".md", "")
        score = entry["score"]
        query = entry["query"][:80]
        bullets.append(
            f'- [{slug}]({path}) — similarity: {score:.2f} — matched query: "{query}"'
        )

    # Heading (with its own blank line), the bullets, then a trailing newline.
    return "\n".join(["## Prior Art (automated pre-screening)\n", *bullets, ""])