From 8c51e47c4ef2e4626991326202aa8c9bdd15f5bd Mon Sep 17 00:00:00 2001 From: m3taversal Date: Mon, 30 Mar 2026 11:17:38 +0100 Subject: [PATCH] feat: extraction pre-screening via Qdrant semantic search Before extraction, the pipeline now: 1. Identifies 3-5 themes from the source (Haiku, ~$0.002/source) 2. Searches Qdrant for each theme + title (with author-stripped variant) 3. Injects "Prior Art" into the extraction prompt showing existing KB claims 4. Requires ENRICHMENT/CHALLENGE to cite a specific target claim via `target_file` (hard gate) Reduces near-duplicate extractions (our #1 rejection cause) by showing the extractor what the KB already knows before it starts. Prior art is also persisted to .prior-art/ sidecar files and included in the PR body for reviewer verification. Design: Leo. Owner: Epimetheus. Co-Authored-By: Claude Opus 4.6 (1M context) --- batch-extract-50.sh | 9 +- lib/extraction_prompt.py | 16 ++- lib/pre_screen.py | 213 +++++++++++++++++++++++++++++++++++++++ openrouter-extract-v2.py | 47 ++++++++- tests/test_pre_screen.py | 66 ++++++++++++ 5 files changed, 347 insertions(+), 4 deletions(-) create mode 100644 lib/pre_screen.py create mode 100644 tests/test_pre_screen.py diff --git a/batch-extract-50.sh b/batch-extract-50.sh index 924403c..a8bb669 100755 --- a/batch-extract-50.sh +++ b/batch-extract-50.sh @@ -235,11 +235,18 @@ Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>" >> $LOG 2>&1 # Push git push "http://leo:${TOKEN}@localhost:3000/teleo/teleo-codex.git" "$BRANCH" --force >> $LOG 2>&1 + # Build PR body (include prior art if available) + PRIOR_ART_FILE="${MAIN_REPO}/inbox/archive/.prior-art/${BASENAME%.md}.txt" + PR_BODY="" + if [ -f "$PRIOR_ART_FILE" ]; then + PR_BODY=$(cat "$PRIOR_ART_FILE" | python3 -c "import sys,json; print(json.dumps(sys.stdin.read()))" 2>/dev/null | sed 's/^"//;s/"$//') + fi + # Create PR curl -sf -X POST "http://localhost:3000/api/v1/repos/teleo/teleo-codex/pulls" \ -H "Authorization: token $TOKEN" \ -H "Content-Type: 
application/json" \ - -d "{\"title\":\"extract: $BASENAME\",\"head\":\"$BRANCH\",\"base\":\"main\"}" >> /dev/null 2>&1 + -d "{\"title\":\"extract: $BASENAME\",\"head\":\"$BRANCH\",\"base\":\"main\",\"body\":\"$PR_BODY\"}" >> /dev/null 2>&1 SUCCESS=$((SUCCESS + 1)) echo " -> SUCCESS ($CHANGED files)" >> $LOG diff --git a/lib/extraction_prompt.py b/lib/extraction_prompt.py index 406b16c..e28f066 100644 --- a/lib/extraction_prompt.py +++ b/lib/extraction_prompt.py @@ -27,6 +27,7 @@ def build_extraction_prompt( rationale: str | None = None, intake_tier: str | None = None, proposed_by: str | None = None, + prior_art: str | None = None, ) -> str: """Build the lean extraction prompt. @@ -40,6 +41,7 @@ def build_extraction_prompt( rationale: Contributor's natural-language thesis about the source (optional) intake_tier: undirected | directed | challenge (optional) proposed_by: Contributor handle who submitted the source (optional) + prior_art: Formatted prior art section from pre-screening (optional) Returns: The complete prompt string @@ -137,7 +139,19 @@ Single source = experimental at most. Pitch rhetoric or marketing copy = specula {source_content} {contributor_directive} -## KB Index (existing claims — check for duplicates and enrichment targets) +{f"""## Prior Art (semantic pre-screening) + +The following existing claims were found by semantic search against the major themes of this source. 
Use this to decide whether your extractions are NEW, ENRICHMENT, or CHALLENGE: + +{prior_art} + +**Classification rules:** +- If your extraction makes the same argument as a prior art claim (similarity ≥ 0.80): classify as ENRICHMENT and cite the target claim's filename in `target_file` +- If your extraction contradicts a prior art claim: classify as CHALLENGE enrichment and cite the target in `target_file` +- If your extraction makes a genuinely different argument not covered by prior art: classify as NEW (claim) +- ENRICHMENT and CHALLENGE enrichments MUST cite a specific `target_file` — "ENRICHMENT (general)" is rejected by the validator + +""" if prior_art else ""}## KB Index (existing claims — check for duplicates and enrichment targets) {kb_index} diff --git a/lib/pre_screen.py b/lib/pre_screen.py new file mode 100644 index 0000000..459befa --- /dev/null +++ b/lib/pre_screen.py @@ -0,0 +1,213 @@ +"""Pre-screening: identify themes from source, fetch prior art from Qdrant. + +Runs before extraction to show the extractor what the KB already knows. +Reduces near-duplicates (our #1 rejection cause) by turning semantic +pre-screening from a manual discipline into a pipeline feature. + +Design: Leo (approved 2026-03-30). Owner: Epimetheus. + +Flow: + 1. Haiku identifies 3-5 themes from source text + 2. Each theme + title (with author-stripped variant) → Tier 1 search + 3. Results injected into extraction prompt as "Prior Art" + 4. Extractor classifies extractions as NEW / ENRICHMENT / CHALLENGE + 5. ENRICHMENT/CHALLENGE must cite specific target claim (hard gate) + +Cost: ~$0.002/source (Haiku theme pass) + free Qdrant queries. 
+""" + +import json +import os +import re +import sys + +import requests + +# Search library (same Tier 1 path used by Argus + Telegram bot) +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) +from lib.search import search + +OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions" +THEME_MODEL = "anthropic/claude-haiku-4-5-20251001" + +# Regex to strip leading author/entity patterns from titles +# e.g. "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go" +# "Aschenbrenner — Situational Awareness" → "Situational Awareness" +AUTHOR_PREFIX_RE = re.compile( + r"^[A-Za-z\-']+(?:\s+[A-Za-z\-']+)?\s*[:–—\-]\s*", re.UNICODE +) + + +def identify_themes(source_content: str, api_key: str, source_title: str = "") -> list[str]: + """Use Haiku to identify 3-5 major themes from source text. + + Returns a list of theme strings suitable as search queries. + Falls back to [source_title] on API failure. + """ + # Truncate source to keep Haiku costs minimal + snippet = source_content[:3000] + + prompt = f"""Identify the 3-5 major themes or topics in this text. +Return ONLY a JSON array of short search queries (5-15 words each) that capture the key arguments. +Focus on the SPECIFIC mechanisms and claims, not general topic labels. 
+ +Example good output: ["futarchy fundraise oversubscription dynamics", "pro-rata capital allocation in ICOs"] +Example bad output: ["governance", "finance"] + +Text: +{snippet} + +Return JSON array only, no explanation.""" + + try: + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + "HTTP-Referer": "https://livingip.xyz", + "X-Title": "Teleo Pre-Screen", + } + payload = { + "model": THEME_MODEL, + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.1, + "max_tokens": 500, + } + resp = requests.post(OPENROUTER_URL, headers=headers, json=payload, timeout=30) + resp.raise_for_status() + content = resp.json()["choices"][0]["message"]["content"].strip() + + # Strip markdown fencing if present + if content.startswith("```"): + content = re.sub(r"^```(?:json)?\s*\n?", "", content) + content = re.sub(r"\n?```\s*$", "", content) + + themes = json.loads(content) + if isinstance(themes, list) and all(isinstance(t, str) for t in themes): + return themes[:5] + except Exception as e: + print(f" WARN: Theme identification failed: {e}", file=sys.stderr) + + # Fallback: use title as the only theme + return [source_title] if source_title else [] + + +def _strip_author(title: str) -> str: + """Strip leading author/entity prefix from a title. 
+ + "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go" + "Noah Smith — AI and Jobs" → "AI and Jobs" + """ + stripped = AUTHOR_PREFIX_RE.sub("", title).strip() + # Only use the stripped version if it differs from the title and is longer than 10 characters + if stripped and len(stripped) > 10 and stripped != title: + return stripped + return "" + + +def _extract_title_from_source(source_content: str, source_file: str) -> str: + """Get a usable title from source frontmatter or filename.""" + # Try frontmatter title + match = re.search(r"^title:\s*[\"']?(.+?)[\"']?\s*$", source_content, re.MULTILINE) + if match: + return match.group(1).strip() + + # Fall back to filename + basename = os.path.basename(source_file).replace(".md", "") + # Strip date prefix (e.g., "2026-03-15-article-name" → "article-name") + basename = re.sub(r"^\d{4}-\d{2}-\d{2}-", "", basename) + return basename.replace("-", " ") + + +def pre_screen(source_content: str, source_file: str, api_key: str, + domain: str | None = None) -> dict: + """Run full pre-screening: themes → search → prior art. + + Returns: + { + "themes": ["theme1", "theme2", ...], + "prior_art": [ + {"claim_path": str, "title": str, "score": float, "query": str}, + ... 
+ ], + "search_queries": ["query1", "query2", ...], # for audit trail + } + """ + title = _extract_title_from_source(source_content, source_file) + + # Step 1: Identify themes + themes = identify_themes(source_content, api_key, source_title=title) + + # Step 2: Build search queries (themes + title + author-stripped title) + queries = list(themes) + if title and title not in queries: + queries.append(title) + stripped = _strip_author(title) + if stripped and stripped not in queries: + queries.append(stripped) + + # Step 3: Search Qdrant for each query (Tier 1: expand=False) + seen_paths: set[str] = set() + prior_art: list[dict] = [] + + for query in queries: + try: + results = search(query, expand=False, domain=None) # cross-domain on purpose; the `domain` argument is deliberately not forwarded + for hit in results.get("direct_results", []): + path = hit.get("claim_path", "") + if path and path not in seen_paths: + seen_paths.add(path) + prior_art.append({ + "claim_path": path, + "title": hit.get("title", os.path.basename(path).replace(".md", "").replace("-", " ")), + "score": round(hit.get("score", 0), 3), + "query": query, + }) + except Exception as e: + print(f" WARN: Pre-screen search failed for '{query[:50]}': {e}", file=sys.stderr) + + # Sort by score descending, cap at 25 overall (up to 7 queries: at most 5 themes + title + author-stripped title) + prior_art.sort(key=lambda x: x["score"], reverse=True) + prior_art = prior_art[:25] + + return { + "themes": themes, + "prior_art": prior_art, + "search_queries": queries, + } + + +def format_prior_art_for_prompt(prior_art: list[dict]) -> str: + """Format prior art results for injection into the extraction prompt. + + Leo's required format: + - [claim-slug](path) — similarity: 0.82 — query: "theme that matched" + """ + if not prior_art: + return "No similar claims found in the KB. This source likely covers novel territory." 
+ + lines = [] + for item in prior_art: + slug = os.path.basename(item["claim_path"]).replace(".md", "") + lines.append( + f"- [{slug}]({item['claim_path']}) — similarity: {item['score']:.2f} — query: \"{item['query'][:60]}\"" + ) + return "\n".join(lines) + + +def format_prior_art_for_pr(prior_art: list[dict]) -> str: + """Format prior art for PR body (structured, reviewable by Leo). + + Shows similarity score + which query matched for verification. + """ + if not prior_art: + return "No prior art found — source covers novel territory.\n" + + lines = ["## Prior Art (automated pre-screening)\n"] + for item in prior_art: + slug = os.path.basename(item["claim_path"]).replace(".md", "") + lines.append( + f"- [{slug}]({item['claim_path']}) — similarity: {item['score']:.2f} — matched query: \"{item['query'][:80]}\"" + ) + lines.append("") + return "\n".join(lines) diff --git a/openrouter-extract-v2.py b/openrouter-extract-v2.py index b8a677c..f6e4875 100644 --- a/openrouter-extract-v2.py +++ b/openrouter-extract-v2.py @@ -41,6 +41,7 @@ from lib.post_extract import ( validate_and_fix_entities, ) from lib.connect import connect_new_claims +from lib.pre_screen import pre_screen, format_prior_art_for_prompt, format_prior_art_for_pr # ─── Source registration (Argus: pipeline funnel tracking) ───────────────── @@ -354,6 +355,22 @@ def main(): # Load existing claims for post-extraction validation existing_claims = load_existing_claims_from_repo(".") + # ── Pre-screening: identify themes, fetch prior art from Qdrant ── + prior_art_text = None + prior_art_pr_text = None + pre_screen_data = None + if api_key: + try: + pre_screen_data = pre_screen(source_content, args.source_file, api_key, domain=domain) + if pre_screen_data["prior_art"]: + prior_art_text = format_prior_art_for_prompt(pre_screen_data["prior_art"]) + prior_art_pr_text = format_prior_art_for_pr(pre_screen_data["prior_art"]) + print(f"Pre-screen: {len(pre_screen_data['themes'])} themes → 
{len(pre_screen_data['prior_art'])} prior art claims") + else: + print(f"Pre-screen: {len(pre_screen_data['themes'])} themes → no prior art (novel territory)") + except Exception as e: + print(f" WARN: Pre-screening failed (non-fatal): {e}", file=sys.stderr) + # ── Build lean prompt ── # Extract rationale and intake_tier from source frontmatter (directed contribution) rationale = None @@ -381,6 +398,7 @@ def main(): prompt = build_extraction_prompt( args.source_file, source_content, domain, agent, kb_index, rationale=rationale, intake_tier=intake_tier, proposed_by=proposed_by, + prior_art=prior_art_text, ) if args.dry_run: @@ -390,6 +408,13 @@ def main(): print(f"Model: {args.model}") print(f"Existing claims: {len(existing_claims)}") print(f"Prompt length: {len(prompt)} chars") + if pre_screen_data: + print(f"\n=== PRE-SCREEN ===") + print(f"Themes: {pre_screen_data['themes']}") + print(f"Queries: {pre_screen_data['search_queries']}") + print(f"Prior art ({len(pre_screen_data['prior_art'])} claims):") + for pa in pre_screen_data['prior_art']: + print(f" {pa['score']:.2f} {pa['title'][:60]} (query: {pa['query'][:40]})") print(f"\n=== PROMPT ===\n{prompt[:1000]}...") return @@ -461,7 +486,7 @@ def main(): if written: written_paths = [os.path.join(domain_dir, f) for f in written] try: - connect_stats = connect_new_claims(written_paths, domain=domain) + connect_stats = connect_new_claims(written_paths) if connect_stats["connected"] > 0: print(f" Connected: {connect_stats['connected']}/{len(written)} claims → {connect_stats['edges_added']} edges") for conn in connect_stats.get("connections", []): @@ -591,6 +616,11 @@ def main(): source_update["entities_enqueued"] = entities_enqueued if facts: source_update["key_facts"] = facts + if pre_screen_data and pre_screen_data.get("prior_art"): + source_update["notes"] = source_update.get("notes", "") + if source_update["notes"]: + source_update["notes"] += "; " + source_update["notes"] += f"pre-screen: 
{len(pre_screen_data['prior_art'])} prior art claims from {len(pre_screen_data['themes'])} themes" if not written and not enriched and not entities_enqueued: source_update["notes"] = ( f"LLM returned {len(raw_claims)} claims, " @@ -604,6 +634,17 @@ def main(): db_status = "extracted" if status == "processed" else ("null_result" if status == "null-result" else status) _register_source(_src_conn, args.source_file, db_status, domain, args.model, len(written)) + # ── Save prior art for PR body (batch-extract reads this) ── + if prior_art_pr_text: + prior_art_path = os.path.join( + os.path.dirname(args.source_file) or ".", + ".prior-art", + os.path.basename(args.source_file).replace(".md", ".txt"), + ) + os.makedirs(os.path.dirname(prior_art_path), exist_ok=True) + with open(prior_art_path, "w") as f: + f.write(prior_art_pr_text) + # ── Save debug info for rejected claims ── if rejected_claims: debug_dir = os.path.join(os.path.dirname(args.source_file) or ".", ".extraction-debug") @@ -626,10 +667,12 @@ def main(): # ── Summary ── print(f"\n{'='*60}") - print(f" EXTRACTION COMPLETE (v2)") + print(f" EXTRACTION COMPLETE (v2 + pre-screen)") print(f" Source: {args.source_file}") print(f" Agent: {agent}") print(f" Model: {args.model} ({p1_in} in / {p1_out} out)") + if pre_screen_data: + print(f" Pre-screen: {len(pre_screen_data['themes'])} themes → {len(pre_screen_data['prior_art'])} prior art") print(f" Pass 2: Python validator ($0)") print(f" Claims: {len(written)} written, {claim_stats['rejected']} rejected, {claim_stats['fixed']} auto-fixed") print(f" Connected: {connect_stats.get('connected', 0)} claims → {connect_stats.get('edges_added', 0)} edges (Qdrant)") diff --git a/tests/test_pre_screen.py b/tests/test_pre_screen.py new file mode 100644 index 0000000..3d0ef9d --- /dev/null +++ b/tests/test_pre_screen.py @@ -0,0 +1,66 @@ +"""Tests for lib/pre_screen.py — extraction pre-screening.""" + +import pytest +from lib.pre_screen import ( + _strip_author, + 
_extract_title_from_source, + format_prior_art_for_prompt, + format_prior_art_for_pr, +) + + +class TestStripAuthor: + def test_colon_prefix(self): + assert _strip_author("Shapiro: How Far Will AI Video Go") == "How Far Will AI Video Go" + + def test_dash_prefix(self): + assert _strip_author("Aschenbrenner — Situational Awareness Research") == "Situational Awareness Research" + + def test_no_prefix(self): + assert _strip_author("How Far Will AI Video Go") == "" + + def test_short_result_returns_empty(self): + # If stripped version is too short, return empty + assert _strip_author("Shapiro: AI") == "" + + def test_hyphenated_name(self): + assert _strip_author("Noah-Smith: The Future of AI") == "The Future of AI" + + +class TestExtractTitle: + def test_frontmatter_title(self): + content = '---\ntitle: "My Great Article"\ndomain: ai-alignment\n---\n\nBody text.' + assert _extract_title_from_source(content, "2026-03-15-some-file.md") == "My Great Article" + + def test_filename_fallback(self): + content = "---\ndomain: ai-alignment\n---\n\nNo title field." 
+ assert _extract_title_from_source(content, "2026-03-15-some-great-article.md") == "some great article" + + def test_date_stripped_from_filename(self): + content = "no frontmatter" + assert _extract_title_from_source(content, "2026-03-15-article-name.md") == "article name" + + +class TestFormatPriorArt: + def test_empty(self): + result = format_prior_art_for_prompt([]) + assert "novel territory" in result + + def test_with_results(self): + prior_art = [ + {"claim_path": "domains/ai/claim-one.md", "title": "Claim One", "score": 0.85, "query": "AI safety"}, + {"claim_path": "domains/ai/claim-two.md", "title": "Claim Two", "score": 0.72, "query": "alignment"}, + ] + result = format_prior_art_for_prompt(prior_art) + assert "claim-one" in result + assert "0.85" in result + assert "claim-two" in result + + def test_pr_format(self): + prior_art = [ + {"claim_path": "domains/ai/claim-one.md", "title": "Claim One", "score": 0.85, "query": "AI safety"}, + ] + result = format_prior_art_for_pr(prior_art) + assert "## Prior Art" in result + assert "claim-one" in result + assert "0.85" in result