teleo-infrastructure/lib/pre_screen.py
m3taversal 8c51e47c4e feat: extraction pre-screening via Qdrant semantic search
Before extraction, the pipeline now:
1. Identifies 3-5 themes from source (Haiku, ~$0.002/source)
2. Searches Qdrant for each theme + title (with author-stripped variant)
3. Injects "Prior Art" into extraction prompt showing existing KB claims
4. Requires ENRICHMENT/CHALLENGE to cite specific target_claim (hard gate)

Reduces near-duplicate extractions (our #1 rejection cause) by showing
the extractor what the KB already knows before it starts.

Prior art also persisted to .prior-art/ sidecar files and included in
PR body for reviewer verification.

Design: Leo. Owner: Epimetheus.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 11:17:38 +01:00

213 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Pre-screening: identify themes from source, fetch prior art from Qdrant.
Runs before extraction to show the extractor what the KB already knows.
Reduces near-duplicates (our #1 rejection cause) by turning semantic
pre-screening from a manual discipline into a pipeline feature.
Design: Leo (approved 2026-03-30). Owner: Epimetheus.
Flow:
1. Haiku identifies 3-5 themes from source text
2. Each theme + title (with author-stripped variant) → Tier 1 search
3. Results injected into extraction prompt as "Prior Art"
4. Extractor classifies extractions as NEW / ENRICHMENT / CHALLENGE
5. ENRICHMENT/CHALLENGE must cite specific target claim (hard gate)
Cost: ~$0.002/source (Haiku theme pass) + free Qdrant queries.
"""
import json
import os
import re
import sys
import requests
# Search library (same Tier 1 path used by Argus + Telegram bot)
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from lib.search import search
# Chat-completions endpoint that the theme-identification request is POSTed to.
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
# Model identifier sent in the request payload for the theme pass.
THEME_MODEL = "anthropic/claude-haiku-4-5-20251001"

# Regex to strip leading author/entity patterns from titles
# e.g. "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go"
# "Aschenbrenner — Situational Awareness" → "Situational Awareness"
#
# The prefix is one or two name-like words followed by a separator. A colon,
# en dash or em dash always counts as a separator; an ASCII hyphen counts only
# when whitespace follows it, so a hyphenated first word ("Multi-agent ...")
# is not mistaken for "author - title" and truncated.
AUTHOR_PREFIX_RE = re.compile(
    r"^[A-Za-z\-']+(?:\s+[A-Za-z\-']+)?\s*(?:[:–—]|-(?=\s))\s*", re.UNICODE
)
def identify_themes(source_content: str, api_key: str, source_title: str = "") -> list[str]:
    """Use Haiku to identify 3-5 major themes from source text.

    Args:
        source_content: Full source text; only the first 3000 characters are
            sent to the model.
        api_key: Bearer token placed in the Authorization header.
        source_title: Title used as the single fallback theme.

    Returns:
        Up to five non-blank, de-duplicated theme strings suitable as search
        queries. Falls back to ``[source_title]`` (or ``[]`` when no title is
        given) if the API call fails or the response contains no usable themes.
    """
    # Truncate source to keep Haiku costs minimal
    snippet = source_content[:3000]
    prompt = f"""Identify the 3-5 major themes or topics in this text.
Return ONLY a JSON array of short search queries (5-15 words each) that capture the key arguments.
Focus on the SPECIFIC mechanisms and claims, not general topic labels.
Example good output: ["futarchy fundraise oversubscription dynamics", "pro-rata capital allocation in ICOs"]
Example bad output: ["governance", "finance"]
Text:
{snippet}
Return JSON array only, no explanation."""
    try:
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://livingip.xyz",
            "X-Title": "Teleo Pre-Screen",
        }
        payload = {
            "model": THEME_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1,
            "max_tokens": 500,
        }
        resp = requests.post(OPENROUTER_URL, headers=headers, json=payload, timeout=30)
        resp.raise_for_status()
        content = resp.json()["choices"][0]["message"]["content"].strip()
        # Strip markdown fencing if present
        if content.startswith("```"):
            content = re.sub(r"^```(?:json)?\s*\n?", "", content)
            content = re.sub(r"\n?```\s*$", "", content)
        themes = json.loads(content)
        if isinstance(themes, list) and all(isinstance(t, str) for t in themes):
            # Blank strings make useless search queries and duplicates repeat
            # the same search, so normalise before applying the cap of five.
            cleaned = list(dict.fromkeys(t.strip() for t in themes if t.strip()))
            if cleaned:
                return cleaned[:5]
        # Parsed fine but unusable (wrong shape or empty): say so instead of
        # silently dropping through to the fallback.
        print(" WARN: Theme identification returned no usable themes", file=sys.stderr)
    except Exception as e:
        # Best-effort step: any failure (network, HTTP status, response shape,
        # JSON decoding) degrades to the title fallback rather than aborting.
        print(f" WARN: Theme identification failed: {e}", file=sys.stderr)
    # Fallback: use title as the only theme
    return [source_title] if source_title else []
def _strip_author(title: str) -> str:
    """Return *title* without its leading author/entity prefix.

    "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go"
    "Noah Smith — AI and Jobs" → "AI and Jobs"

    Returns "" when removing the prefix leaves nothing, leaves 10 characters
    or fewer, or leaves the title unchanged.
    """
    candidate = AUTHOR_PREFIX_RE.sub("", title).strip()
    # Reject results that are empty, too short to be a meaningful query, or
    # identical to the input.
    if not candidate or len(candidate) <= 10 or candidate == title:
        return ""
    return candidate
def _extract_title_from_source(source_content: str, source_file: str) -> str:
"""Get a usable title from source frontmatter or filename."""
# Try frontmatter title
match = re.search(r"^title:\s*[\"']?(.+?)[\"']?\s*$", source_content, re.MULTILINE)
if match:
return match.group(1).strip()
# Fall back to filename
basename = os.path.basename(source_file).replace(".md", "")
# Strip date prefix (e.g., "2026-03-15-article-name" → "article-name")
basename = re.sub(r"^\d{4}-\d{2}-\d{2}-", "", basename)
return basename.replace("-", " ")
def pre_screen(source_content: str, source_file: str, api_key: str,
               domain: str | None = None) -> dict:
    """Run full pre-screening: themes → search → prior art.

    Args:
        source_content: Full text of the source being extracted.
        source_file: Path of the source file (title fallback).
        api_key: Key passed to the theme-identification call.
        domain: Accepted for interface compatibility; the search is always
            run with ``domain=None`` (cross-domain on purpose).

    Returns:
        {
            "themes": ["theme1", "theme2", ...],
            "prior_art": [
                {"claim_path": str, "title": str, "score": float, "query": str},
                ...
            ],
            "search_queries": ["query1", "query2", ...],  # for audit trail
        }

        ``prior_art`` holds one entry per claim path -- the best-scoring hit
        across all queries -- sorted by score descending and capped at 25.
    """
    title = _extract_title_from_source(source_content, source_file)

    # Step 1: Identify themes
    themes = identify_themes(source_content, api_key, source_title=title)

    # Step 2: Build search queries (themes + title + author-stripped title)
    queries = list(themes)
    if title and title not in queries:
        queries.append(title)
    stripped = _strip_author(title)
    if stripped and stripped not in queries:
        queries.append(stripped)

    # Step 3: Search Qdrant for each query (Tier 1: expand=False)
    best_by_path: dict[str, dict] = {}
    for query in queries:
        if not query.strip():
            # Nothing to search for.
            continue
        try:
            results = search(query, expand=False, domain=None)  # cross-domain on purpose
            for hit in results.get("direct_results", []):
                path = hit.get("claim_path", "")
                if not path:
                    continue
                score = round(hit.get("score", 0), 3)
                known = best_by_path.get(path)
                # Keep the strongest match per claim. Keeping the first-seen
                # hit would let a weak early match hide a strong later one,
                # understating the similarity and possibly losing the claim
                # to the cap below.
                if known is not None and known["score"] >= score:
                    continue
                best_by_path[path] = {
                    "claim_path": path,
                    "title": hit.get("title", os.path.basename(path).replace(".md", "").replace("-", " ")),
                    "score": score,
                    "query": query,
                }
        except Exception as e:
            # Best-effort: a failed query must not abort the remaining ones.
            print(f" WARN: Pre-screen search failed for '{query[:50]}': {e}", file=sys.stderr)

    # Sort by score descending, cap at 25
    prior_art = sorted(best_by_path.values(), key=lambda x: x["score"], reverse=True)[:25]

    return {
        "themes": themes,
        "prior_art": prior_art,
        "search_queries": queries,
    }
def format_prior_art_for_prompt(prior_art: list[dict]) -> str:
    """Render prior-art hits as a bullet list for the extraction prompt.

    One bullet per hit, in Leo's required format:
    - [claim-slug](path) — similarity: 0.82 — query: "theme that matched"

    The slug is the claim file's basename with ".md" removed, and the query
    is cut to its first 60 characters. An empty list yields a fixed sentence
    saying no similar claims were found.
    """
    if not prior_art:
        return "No similar claims found in the KB. This source likely covers novel territory."

    def _bullet(entry: dict) -> str:
        path = entry["claim_path"]
        slug = os.path.basename(path).replace(".md", "")
        shown_query = entry["query"][:60]
        return f"- [{slug}]({path}) — similarity: {entry['score']:.2f} — query: \"{shown_query}\""

    return "\n".join(_bullet(entry) for entry in prior_art)
def format_prior_art_for_pr(prior_art: list[dict]) -> str:
    """Render prior-art hits as a section of the PR body (reviewable by Leo).

    Output is a "## Prior Art" heading, a blank line, one bullet per hit
    giving the claim slug, path, similarity score and the matched query
    (first 80 characters), and a trailing newline. An empty list yields a
    single line stating that no prior art was found.
    """
    if not prior_art:
        return "No prior art found — source covers novel territory.\n"

    bullets = []
    for entry in prior_art:
        path = entry["claim_path"]
        slug = os.path.basename(path).replace(".md", "")
        matched = entry["query"][:80]
        bullets.append(
            f"- [{slug}]({path}) — similarity: {entry['score']:.2f} — matched query: \"{matched}\""
        )

    # The empty final element makes the joined text end with a newline.
    return "\n".join(["## Prior Art (automated pre-screening)\n", *bullets, ""])