From 8c51e47c4ef2e4626991326202aa8c9bdd15f5bd Mon Sep 17 00:00:00 2001
From: m3taversal <m3taversal@gmail.com>
Date: Mon, 30 Mar 2026 11:17:38 +0100
Subject: [PATCH] feat: extraction pre-screening via Qdrant semantic search

Before extraction, the pipeline now:
1. Identifies 3-5 themes from source (Haiku, ~$0.002/source)
2. Searches Qdrant for each theme + title (with author-stripped variant)
3. Injects "Prior Art" into extraction prompt showing existing KB claims
4. Requires ENRICHMENT/CHALLENGE to cite a specific target claim via target_file (hard gate)

Reduces near-duplicate extractions (our #1 rejection cause) by showing
the extractor what the KB already knows before it starts.

Prior art also persisted to .prior-art/ sidecar files and included in
PR body for reviewer verification.

Design: Leo. Owner: Epimetheus.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 batch-extract-50.sh      |   9 +-
 lib/extraction_prompt.py |  16 ++-
 lib/pre_screen.py        | 213 +++++++++++++++++++++++++++++++++++++++
 openrouter-extract-v2.py |  47 ++++++++-
 tests/test_pre_screen.py |  66 ++++++++++++
 5 files changed, 347 insertions(+), 4 deletions(-)
 create mode 100644 lib/pre_screen.py
 create mode 100644 tests/test_pre_screen.py

diff --git a/batch-extract-50.sh b/batch-extract-50.sh
index 924403c..a8bb669 100755
--- a/batch-extract-50.sh
+++ b/batch-extract-50.sh
@@ -235,11 +235,18 @@ Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>" >> $LOG 2>&1
     # Push
     git push "http://leo:${TOKEN}@localhost:3000/teleo/teleo-codex.git" "$BRANCH" --force >> $LOG 2>&1
 
+    # Build PR body (include prior art if available)
+    PRIOR_ART_FILE="${MAIN_REPO}/inbox/archive/.prior-art/${BASENAME%.md}.txt"
+    PR_BODY=""
+    if [ -f "$PRIOR_ART_FILE" ]; then
+        PR_BODY=$(cat "$PRIOR_ART_FILE" | python3 -c "import sys,json; print(json.dumps(sys.stdin.read()))" 2>/dev/null | sed 's/^"//;s/"$//')
+    fi
+
     # Create PR
     curl -sf -X POST "http://localhost:3000/api/v1/repos/teleo/teleo-codex/pulls" \
         -H "Authorization: token $TOKEN" \
         -H "Content-Type: application/json" \
-        -d "{\"title\":\"extract: $BASENAME\",\"head\":\"$BRANCH\",\"base\":\"main\"}" >> /dev/null 2>&1
+        -d "{\"title\":\"extract: $BASENAME\",\"head\":\"$BRANCH\",\"base\":\"main\",\"body\":\"$PR_BODY\"}" >> /dev/null 2>&1
 
     SUCCESS=$((SUCCESS + 1))
     echo "  -> SUCCESS ($CHANGED files)" >> $LOG
diff --git a/lib/extraction_prompt.py b/lib/extraction_prompt.py
index 406b16c..e28f066 100644
--- a/lib/extraction_prompt.py
+++ b/lib/extraction_prompt.py
@@ -27,6 +27,7 @@ def build_extraction_prompt(
     rationale: str | None = None,
     intake_tier: str | None = None,
     proposed_by: str | None = None,
+    prior_art: str | None = None,
 ) -> str:
     """Build the lean extraction prompt.
 
@@ -40,6 +41,7 @@ def build_extraction_prompt(
         rationale: Contributor's natural-language thesis about the source (optional)
         intake_tier: undirected | directed | challenge (optional)
         proposed_by: Contributor handle who submitted the source (optional)
+        prior_art: Formatted prior art section from pre-screening (optional)
 
     Returns:
         The complete prompt string
@@ -137,7 +139,19 @@ Single source = experimental at most. Pitch rhetoric or marketing copy = specula
 
 {source_content}
 {contributor_directive}
-## KB Index (existing claims — check for duplicates and enrichment targets)
+{f"""## Prior Art (semantic pre-screening)
+
+The following existing claims were found by semantic search against the major themes of this source. Use this to decide whether your extractions are NEW, ENRICHMENT, or CHALLENGE:
+
+{prior_art}
+
+**Classification rules:**
+- If your extraction makes the same argument as a prior art claim (similarity ≥ 0.80): classify as ENRICHMENT and cite the target claim's filename in `target_file`
+- If your extraction contradicts a prior art claim: classify as CHALLENGE enrichment and cite the target in `target_file`
+- If your extraction makes a genuinely different argument not covered by prior art: classify as NEW (claim)
+- ENRICHMENT and CHALLENGE enrichments MUST cite a specific `target_file` — "ENRICHMENT (general)" is rejected by the validator
+
+""" if prior_art else ""}## KB Index (existing claims — check for duplicates and enrichment targets)
 
 {kb_index}
 
diff --git a/lib/pre_screen.py b/lib/pre_screen.py
new file mode 100644
index 0000000..459befa
--- /dev/null
+++ b/lib/pre_screen.py
@@ -0,0 +1,213 @@
+"""Pre-screening: identify themes from source, fetch prior art from Qdrant.
+
+Runs before extraction to show the extractor what the KB already knows.
+Reduces near-duplicates (our #1 rejection cause) by turning semantic
+pre-screening from a manual discipline into a pipeline feature.
+
+Design: Leo (approved 2026-03-30). Owner: Epimetheus.
+
+Flow:
+  1. Haiku identifies 3-5 themes from source text
+  2. Each theme + title (with author-stripped variant) → Tier 1 search
+  3. Results injected into extraction prompt as "Prior Art"
+  4. Extractor classifies extractions as NEW / ENRICHMENT / CHALLENGE
+  5. ENRICHMENT/CHALLENGE must cite specific target claim (hard gate)
+
+Cost: ~$0.002/source (Haiku theme pass) + free Qdrant queries.
+"""
+
+import json
+import os
+import re
+import sys
+
+import requests
+
+# Search library (same Tier 1 path used by Argus + Telegram bot)
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from lib.search import search
+
+OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
+THEME_MODEL = "anthropic/claude-haiku-4-5-20251001"
+
+# Strips a leading "Author:" / "Author —" prefix (one or two words) from titles,
+# e.g. "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go".
+# A plain "-" only separates when surrounded by whitespace, so a hyphenated first word stays intact.
+AUTHOR_PREFIX_RE = re.compile(
+    r"^[A-Za-z\-']+(?:\s+[A-Za-z\-']+)?(?:\s*[:–—]\s*|\s+-\s+)", re.UNICODE
+)
+
+
+def identify_themes(source_content: str, api_key: str, source_title: str = "") -> list[str]:
+    """Use Haiku to identify 3-5 major themes from source text.
+
+    Returns at most five theme strings suitable as search queries. If the request or JSON
+    parsing fails, or the reply is not a list of strings: [source_title], or [] without a title.
+    """
+    # Only the first 3000 characters are sent, to keep Haiku costs minimal
+    snippet = source_content[:3000]
+
+    prompt = f"""Identify the 3-5 major themes or topics in this text.
+Return ONLY a JSON array of short search queries (5-15 words each) that capture the key arguments.
+Focus on the SPECIFIC mechanisms and claims, not general topic labels.
+
+Example good output: ["futarchy fundraise oversubscription dynamics", "pro-rata capital allocation in ICOs"]
+Example bad output: ["governance", "finance"]
+
+Text:
+{snippet}
+
+Return JSON array only, no explanation."""
+
+    try:
+        headers = {
+            "Authorization": f"Bearer {api_key}",
+            "Content-Type": "application/json",
+            "HTTP-Referer": "https://livingip.xyz",
+            "X-Title": "Teleo Pre-Screen",
+        }
+        payload = {
+            "model": THEME_MODEL,
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": 0.1,
+            "max_tokens": 500,
+        }
+        resp = requests.post(OPENROUTER_URL, headers=headers, json=payload, timeout=30)
+        resp.raise_for_status()
+        content = resp.json()["choices"][0]["message"]["content"].strip()
+
+        # Strip a leading/trailing markdown code fence if the reply is wrapped in one
+        if content.startswith("```"):
+            content = re.sub(r"^```(?:json)?\s*\n?", "", content)
+            content = re.sub(r"\n?```\s*$", "", content)
+
+        themes = json.loads(content)
+        if isinstance(themes, list) and all(isinstance(t, str) for t in themes):
+            return themes[:5]
+    except Exception as e:
+        print(f"  WARN: Theme identification failed: {e}", file=sys.stderr)
+
+    # Fallback (also reached, without a warning, when the reply parsed but was not a list of strings)
+    return [source_title] if source_title else []
+
+
+def _strip_author(title: str) -> str:
+    """Return *title* without its leading author/entity prefix, or "".
+
+    "" is returned when nothing was stripped or when 10 or fewer characters remain.
+    "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go"
+    """
+    remainder = AUTHOR_PREFIX_RE.sub("", title).strip()
+    # A short or unchanged remainder is not used as an extra query
+    if len(remainder) <= 10 or remainder == title:
+        return ""
+    return remainder
+
+
+def _extract_title_from_source(source_content: str, source_file: str) -> str:
+    """Return the source's `title:` field if non-empty, else a title built from the filename."""
+    # [ \t]* (not \s*) after the colon: an empty `title:` must not capture the following line
+    match = re.search(r"^title:[ \t]*[\"']?(.+?)[\"']?\s*$", source_content, re.MULTILINE)
+    if match and match.group(1).strip():
+        return match.group(1).strip()
+
+    # Fall back to filename
+    basename = os.path.basename(source_file).replace(".md", "")
+    # Strip date prefix (e.g., "2026-03-15-article-name" → "article-name")
+    basename = re.sub(r"^\d{4}-\d{2}-\d{2}-", "", basename)
+    return basename.replace("-", " ")
+
+
+def pre_screen(source_content: str, source_file: str, api_key: str,
+               domain: str | None = None) -> dict:
+    """Run full pre-screening: themes → search → prior art.
+
+    `domain` is accepted but unused: every search runs cross-domain (domain=None).
+    Returns:
+        {
+            "themes": ["theme1", "theme2", ...],
+            "prior_art": [  # one entry per claim path, best score first, max 25
+                {"claim_path": str, "title": str, "score": float, "query": str},
+                ...
+            ],
+            "search_queries": ["query1", "query2", ...],  # for audit trail
+        }
+    """
+    title = _extract_title_from_source(source_content, source_file)
+
+    # Step 1: Identify themes
+    themes = identify_themes(source_content, api_key, source_title=title)
+
+    # Step 2: Build search queries (themes + title + author-stripped title)
+    queries = list(themes)
+    if title and title not in queries:
+        queries.append(title)
+    stripped = _strip_author(title)
+    if stripped and stripped not in queries:
+        queries.append(stripped)
+
+    # Step 3: Search Qdrant for each query (Tier 1: expand=False), keeping the
+    # best-scoring hit per claim so a weak early match cannot mask a stronger later one
+    best: dict[str, dict] = {}
+
+    for query in queries:
+        try:
+            results = search(query, expand=False, domain=None)  # cross-domain on purpose
+            for hit in results.get("direct_results", []):
+                path = hit.get("claim_path", "")
+                score = round(hit.get("score") or 0, 3)  # a missing/None score counts as 0
+                if path and (path not in best or score > best[path]["score"]):
+                    best[path] = {
+                        "claim_path": path,
+                        "title": hit.get("title", os.path.basename(path).replace(".md", "").replace("-", " ")),
+                        "score": score,
+                        "query": query,
+                    }
+        except Exception as e:
+            print(f"  WARN: Pre-screen search failed for '{query[:50]}': {e}", file=sys.stderr)
+
+    # Sort by score descending (ties keep first-seen order), cap at 25 (5 themes × 5 results max)
+    prior_art = sorted(best.values(), key=lambda x: x["score"], reverse=True)[:25]
+
+    return {
+        "themes": themes,
+        "prior_art": prior_art,
+        "search_queries": queries,
+    }
+
+
+def format_prior_art_for_prompt(prior_art: list[dict]) -> str:
+    """Render prior art as a markdown bullet list for the extraction prompt.
+
+    One line per claim, with the matched query truncated to 60 characters:
+    - [claim-slug](path) — similarity: 0.82 — query: "theme that matched"
+    """
+    if not prior_art:
+        return "No similar claims found in the KB. This source likely covers novel territory."
+
+    def bullet(entry: dict) -> str:
+        # The slug is the claim's filename with its ".md" text removed
+        path = entry["claim_path"]
+        name = os.path.basename(path).replace(".md", "")
+        return f"- [{name}]({path}) — similarity: {entry['score']:.2f} — query: \"{entry['query'][:60]}\""
+
+    return "\n".join(bullet(entry) for entry in prior_art)
+
+
+def format_prior_art_for_pr(prior_art: list[dict]) -> str:
+    """Render prior art as a markdown section for the PR body.
+
+    Each bullet shows the similarity score and the matched query (truncated to 80 characters).
+    """
+    if not prior_art:
+        return "No prior art found — source covers novel territory.\n"
+
+    section = ["## Prior Art (automated pre-screening)\n"]
+    section.extend(
+        f"- [{os.path.basename(entry['claim_path']).replace('.md', '')}]({entry['claim_path']})"
+        f" — similarity: {entry['score']:.2f} — matched query: \"{entry['query'][:80]}\""
+        for entry in prior_art
+    )
+    # The trailing "" makes the joined text end with a newline
+    return "\n".join(section + [""])
diff --git a/openrouter-extract-v2.py b/openrouter-extract-v2.py
index b8a677c..f6e4875 100644
--- a/openrouter-extract-v2.py
+++ b/openrouter-extract-v2.py
@@ -41,6 +41,7 @@ from lib.post_extract import (
     validate_and_fix_entities,
 )
 from lib.connect import connect_new_claims
+from lib.pre_screen import pre_screen, format_prior_art_for_prompt, format_prior_art_for_pr
 
 # ─── Source registration (Argus: pipeline funnel tracking) ─────────────────
 
@@ -354,6 +355,22 @@ def main():
     # Load existing claims for post-extraction validation
     existing_claims = load_existing_claims_from_repo(".")
 
+    # ── Pre-screening: identify themes, fetch prior art from Qdrant ──
+    prior_art_text = None
+    prior_art_pr_text = None
+    pre_screen_data = None
+    if api_key:
+        try:
+            pre_screen_data = pre_screen(source_content, args.source_file, api_key, domain=domain)
+            if pre_screen_data["prior_art"]:
+                prior_art_text = format_prior_art_for_prompt(pre_screen_data["prior_art"])
+                prior_art_pr_text = format_prior_art_for_pr(pre_screen_data["prior_art"])
+                print(f"Pre-screen: {len(pre_screen_data['themes'])} themes → {len(pre_screen_data['prior_art'])} prior art claims")
+            else:
+                print(f"Pre-screen: {len(pre_screen_data['themes'])} themes → no prior art (novel territory)")
+        except Exception as e:
+            print(f"  WARN: Pre-screening failed (non-fatal): {e}", file=sys.stderr)
+
     # ── Build lean prompt ──
     # Extract rationale and intake_tier from source frontmatter (directed contribution)
     rationale = None
@@ -381,6 +398,7 @@ def main():
     prompt = build_extraction_prompt(
         args.source_file, source_content, domain, agent, kb_index,
         rationale=rationale, intake_tier=intake_tier, proposed_by=proposed_by,
+        prior_art=prior_art_text,
     )
 
     if args.dry_run:
@@ -390,6 +408,13 @@ def main():
         print(f"Model: {args.model}")
         print(f"Existing claims: {len(existing_claims)}")
         print(f"Prompt length: {len(prompt)} chars")
+        if pre_screen_data:
+            print(f"\n=== PRE-SCREEN ===")
+            print(f"Themes: {pre_screen_data['themes']}")
+            print(f"Queries: {pre_screen_data['search_queries']}")
+            print(f"Prior art ({len(pre_screen_data['prior_art'])} claims):")
+            for pa in pre_screen_data['prior_art']:
+                print(f"  {pa['score']:.2f}  {pa['title'][:60]}  (query: {pa['query'][:40]})")
         print(f"\n=== PROMPT ===\n{prompt[:1000]}...")
         return
 
@@ -461,7 +486,7 @@ def main():
     if written:
         written_paths = [os.path.join(domain_dir, f) for f in written]
         try:
-            connect_stats = connect_new_claims(written_paths, domain=domain)
+            connect_stats = connect_new_claims(written_paths)
             if connect_stats["connected"] > 0:
                 print(f"  Connected: {connect_stats['connected']}/{len(written)} claims → {connect_stats['edges_added']} edges")
                 for conn in connect_stats.get("connections", []):
@@ -591,6 +616,11 @@ def main():
         source_update["entities_enqueued"] = entities_enqueued
     if facts:
         source_update["key_facts"] = facts
+    if pre_screen_data and pre_screen_data.get("prior_art"):
+        source_update["notes"] = source_update.get("notes", "")
+        if source_update["notes"]:
+            source_update["notes"] += "; "
+        source_update["notes"] += f"pre-screen: {len(pre_screen_data['prior_art'])} prior art claims from {len(pre_screen_data['themes'])} themes"
     if not written and not enriched and not entities_enqueued:
         source_update["notes"] = (
             f"LLM returned {len(raw_claims)} claims, "
@@ -604,6 +634,17 @@ def main():
     db_status = "extracted" if status == "processed" else ("null_result" if status == "null-result" else status)
     _register_source(_src_conn, args.source_file, db_status, domain, args.model, len(written))
 
+    # ── Save prior art for PR body (batch-extract reads this); UTF-8 since the text contains "—" ──
+    if prior_art_pr_text:
+        prior_art_path = os.path.join(
+            os.path.dirname(args.source_file) or ".",
+            ".prior-art",
+            os.path.basename(args.source_file).replace(".md", ".txt"),
+        )
+        os.makedirs(os.path.dirname(prior_art_path), exist_ok=True)
+        with open(prior_art_path, "w", encoding="utf-8") as f:
+            f.write(prior_art_pr_text)
+
     # ── Save debug info for rejected claims ──
     if rejected_claims:
         debug_dir = os.path.join(os.path.dirname(args.source_file) or ".", ".extraction-debug")
@@ -626,10 +667,12 @@ def main():
 
     # ── Summary ──
     print(f"\n{'='*60}")
-    print(f"  EXTRACTION COMPLETE (v2)")
+    print(f"  EXTRACTION COMPLETE (v2 + pre-screen)")
     print(f"  Source:       {args.source_file}")
     print(f"  Agent:        {agent}")
     print(f"  Model:        {args.model} ({p1_in} in / {p1_out} out)")
+    if pre_screen_data:
+        print(f"  Pre-screen:   {len(pre_screen_data['themes'])} themes → {len(pre_screen_data['prior_art'])} prior art")
     print(f"  Pass 2:       Python validator ($0)")
     print(f"  Claims:       {len(written)} written, {claim_stats['rejected']} rejected, {claim_stats['fixed']} auto-fixed")
     print(f"  Connected:    {connect_stats.get('connected', 0)} claims → {connect_stats.get('edges_added', 0)} edges (Qdrant)")
diff --git a/tests/test_pre_screen.py b/tests/test_pre_screen.py
new file mode 100644
index 0000000..3d0ef9d
--- /dev/null
+++ b/tests/test_pre_screen.py
@@ -0,0 +1,66 @@
+"""Tests for lib/pre_screen.py — extraction pre-screening."""
+
+import pytest
+from lib.pre_screen import (
+    _strip_author,
+    _extract_title_from_source,
+    format_prior_art_for_prompt,
+    format_prior_art_for_pr,
+)
+
+
+class TestStripAuthor:
+    def test_colon_prefix(self):
+        assert _strip_author("Shapiro: How Far Will AI Video Go") == "How Far Will AI Video Go"
+
+    def test_dash_prefix(self):
+        assert _strip_author("Aschenbrenner — Situational Awareness Research") == "Situational Awareness Research"
+
+    def test_no_prefix(self):
+        assert _strip_author("How Far Will AI Video Go") == ""
+
+    def test_short_result_returns_empty(self):
+        # If stripped version is too short, return empty
+        assert _strip_author("Shapiro: AI") == ""
+
+    def test_hyphenated_name(self):
+        assert _strip_author("Noah-Smith: The Future of AI") == "The Future of AI"
+
+
+class TestExtractTitle:
+    def test_frontmatter_title(self):
+        content = '---\ntitle: "My Great Article"\ndomain: ai-alignment\n---\n\nBody text.'
+        assert _extract_title_from_source(content, "2026-03-15-some-file.md") == "My Great Article"
+
+    def test_filename_fallback(self):
+        content = "---\ndomain: ai-alignment\n---\n\nNo title field."
+        assert _extract_title_from_source(content, "2026-03-15-some-great-article.md") == "some great article"
+
+    def test_date_stripped_from_filename(self):
+        content = "no frontmatter"
+        assert _extract_title_from_source(content, "2026-03-15-article-name.md") == "article name"
+
+
+class TestFormatPriorArt:
+    def test_empty(self):
+        result = format_prior_art_for_prompt([])
+        assert "novel territory" in result
+
+    def test_with_results(self):
+        prior_art = [
+            {"claim_path": "domains/ai/claim-one.md", "title": "Claim One", "score": 0.85, "query": "AI safety"},
+            {"claim_path": "domains/ai/claim-two.md", "title": "Claim Two", "score": 0.72, "query": "alignment"},
+        ]
+        result = format_prior_art_for_prompt(prior_art)
+        assert "claim-one" in result
+        assert "0.85" in result
+        assert "claim-two" in result
+
+    def test_pr_format(self):
+        prior_art = [
+            {"claim_path": "domains/ai/claim-one.md", "title": "Claim One", "score": 0.85, "query": "AI safety"},
+        ]
+        result = format_prior_art_for_pr(prior_art)
+        assert "## Prior Art" in result
+        assert "claim-one" in result
+        assert "0.85" in result