"""Pre-screening: identify themes from source, fetch prior art from Qdrant. Runs before extraction to show the extractor what the KB already knows. Reduces near-duplicates (our #1 rejection cause) by turning semantic pre-screening from a manual discipline into a pipeline feature. Design: Leo (approved 2026-03-30). Owner: Epimetheus. Flow: 1. Haiku identifies 3-5 themes from source text 2. Each theme + title (with author-stripped variant) → Tier 1 search 3. Results injected into extraction prompt as "Prior Art" 4. Extractor classifies extractions as NEW / ENRICHMENT / CHALLENGE 5. ENRICHMENT/CHALLENGE must cite specific target claim (hard gate) Cost: ~$0.002/source (Haiku theme pass) + free Qdrant queries. """ import json import os import re import sys import requests # Search library (same Tier 1 path used by Argus + Telegram bot) from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) from lib.search import search OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions" THEME_MODEL = "anthropic/claude-haiku-4.5" # Regex to strip leading author/entity patterns from titles # e.g. "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go" # "Aschenbrenner — Situational Awareness" → "Situational Awareness" # Prior art threshold — only show results above this score to the extractor. # 0.50 catches mechanism-level matches where compound themes dilute embeddings. # Was 0.65 but Haiku compound themes score 0.50-0.60 even on exact matches. # False positives cost nothing (extractor sees irrelevant prior art, ignores it). # False negatives cost wasted extraction + review + rejection. PRIOR_ART_THRESHOLD = 0.50 AUTHOR_PREFIX_RE = re.compile( r"^[A-Za-z\-']+(?:\s+[A-Za-z\-']+)?\s*[:–—\-]\s*", re.UNICODE ) def identify_themes(source_content: str, api_key: str, source_title: str = "") -> list[str]: """Use Haiku to identify 3-5 major themes from source text. Returns a list of theme strings suitable as search queries. Falls back to [source_title] on API failure. """ # Truncate source to keep Haiku costs minimal snippet = source_content[:3000] prompt = f"""Identify the 3-5 major themes or topics in this text. Return ONLY a JSON array of short search queries (3-8 words each). Keep queries SHORT — 3-5 words is ideal. Compound phrases score poorly in vector search. Example good output: ["futarchy governance", "semaglutide kidney outcomes", "ICO oversubscription"] Example bad output: ["futarchy governance mechanisms detecting revenue misrepresentation token launches", "prediction market accuracy identifying fraudulent financial claims"] Text: {snippet} Return JSON array only, no explanation.""" try: headers = { "Authorization": f"Bearer {api_key}", "Content-Type": "application/json", "HTTP-Referer": "https://livingip.xyz", "X-Title": "Teleo Pre-Screen", } payload = { "model": THEME_MODEL, "messages": [{"role": "user", "content": prompt}], "temperature": 0.1, "max_tokens": 500, } resp = requests.post(OPENROUTER_URL, headers=headers, json=payload, timeout=30) resp.raise_for_status() content = resp.json()["choices"][0]["message"]["content"].strip() # Strip markdown fencing if present if content.startswith("```"): content = re.sub(r"^```(?:json)?\s*\n?", "", content) content = re.sub(r"\n?```\s*$", "", content) themes = json.loads(content) if isinstance(themes, list) and all(isinstance(t, str) for t in themes): return themes[:5] except Exception as e: print(f" WARN: Theme identification failed: {e}", file=sys.stderr) # Fallback: use title as the only theme return [source_title] if source_title else [] def _strip_author(title: str) -> str: """Strip leading author/entity prefix from a title. "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go" "Noah Smith — AI and Jobs" → "AI and Jobs" """ stripped = AUTHOR_PREFIX_RE.sub("", title).strip() # Only use stripped version if it's meaningfully different if stripped and len(stripped) > 10 and stripped != title: return stripped return "" def _extract_title_from_source(source_content: str, source_file: str) -> str: """Get a usable title from source frontmatter or filename.""" # Try frontmatter title match = re.search(r"^title:\s*[\"']?(.+?)[\"']?\s*$", source_content, re.MULTILINE) if match: return match.group(1).strip() # Fall back to filename basename = os.path.basename(source_file).replace(".md", "") # Strip date prefix (e.g., "2026-03-15-article-name" → "article-name") basename = re.sub(r"^\d{4}-\d{2}-\d{2}-", "", basename) return basename.replace("-", " ") def pre_screen(source_content: str, source_file: str, api_key: str, domain: str | None = None) -> dict: """Run full pre-screening: themes → search → prior art. Returns: { "themes": ["theme1", "theme2", ...], "prior_art": [ {"claim_path": str, "title": str, "score": float, "query": str}, ... ], "search_queries": ["query1", "query2", ...], # for audit trail } """ title = _extract_title_from_source(source_content, source_file) # Step 1: Identify themes themes = identify_themes(source_content, api_key, source_title=title) # Step 2: Build search queries (themes + title + author-stripped title) queries = list(themes) if title and title not in queries: queries.append(title) stripped = _strip_author(title) if stripped and stripped not in queries: queries.append(stripped) # Step 3: Search Qdrant for each query (Tier 1: expand=False) seen_paths: set[str] = set() prior_art: list[dict] = [] for query in queries: try: results = search(query, expand=False, domain=None) # cross-domain on purpose for hit in results.get("direct_results", []): path = hit.get("claim_path", "") if path and path not in seen_paths: seen_paths.add(path) prior_art.append({ "claim_path": path, "title": hit.get("title", os.path.basename(path).replace(".md", "").replace("-", " ")), "score": round(hit.get("score", 0), 3), "query": query, }) except Exception as e: print(f" WARN: Pre-screen search failed for '{query[:50]}': {e}", file=sys.stderr) # Filter below threshold, sort by score descending, cap at 25 prior_art = [p for p in prior_art if p["score"] >= PRIOR_ART_THRESHOLD] prior_art.sort(key=lambda x: x["score"], reverse=True) prior_art = prior_art[:25] return { "themes": themes, "prior_art": prior_art, "search_queries": queries, } def format_prior_art_for_prompt(prior_art: list[dict]) -> str: """Format prior art results for injection into the extraction prompt. Leo's required format: - [claim-slug](path) — similarity: 0.82 — query: "theme that matched" """ if not prior_art: return "No similar claims found in the KB. This source likely covers novel territory." lines = [] for item in prior_art: slug = os.path.basename(item["claim_path"]).replace(".md", "") lines.append( f"- [{slug}]({item['claim_path']}) — similarity: {item['score']:.2f} — query: \"{item['query'][:60]}\"" ) return "\n".join(lines) def format_prior_art_for_pr(prior_art: list[dict]) -> str: """Format prior art for PR body (structured, reviewable by Leo). Shows similarity score + which query matched for verification. """ if not prior_art: return "No prior art found — source covers novel territory.\n" lines = ["## Prior Art (automated pre-screening)\n"] for item in prior_art: slug = os.path.basename(item["claim_path"]).replace(".md", "") lines.append( f"- [{slug}]({item['claim_path']}) — similarity: {item['score']:.2f} — matched query: \"{item['query'][:80]}\"" ) lines.append("") return "\n".join(lines)