feat: extraction pre-screening via Qdrant semantic search
Before extraction, the pipeline now: 1. Identifies 3-5 themes from source (Haiku, ~$0.002/source) 2. Searches Qdrant for each theme + title (with author-stripped variant) 3. Injects "Prior Art" into extraction prompt showing existing KB claims 4. Requires ENRICHMENT/CHALLENGE to cite specific target_claim (hard gate) Reduces near-duplicate extractions (our #1 rejection cause) by showing the extractor what the KB already knows before it starts. Prior art also persisted to .prior-art/ sidecar files and included in PR body for reviewer verification. Design: Leo. Owner: Epimetheus. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d70788a91c
commit
8c51e47c4e
5 changed files with 347 additions and 4 deletions
|
|
@ -235,11 +235,18 @@ Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>" >> $LOG 2>&1
|
|||
# Push
|
||||
git push "http://leo:${TOKEN}@localhost:3000/teleo/teleo-codex.git" "$BRANCH" --force >> $LOG 2>&1
|
||||
|
||||
# Build PR body (include prior art if available)
|
||||
PRIOR_ART_FILE="${MAIN_REPO}/inbox/archive/.prior-art/${BASENAME%.md}.txt"
|
||||
PR_BODY=""
|
||||
if [ -f "$PRIOR_ART_FILE" ]; then
|
||||
PR_BODY=$(cat "$PRIOR_ART_FILE" | python3 -c "import sys,json; print(json.dumps(sys.stdin.read()))" 2>/dev/null | sed 's/^"//;s/"$//')
|
||||
fi
|
||||
|
||||
# Create PR
|
||||
curl -sf -X POST "http://localhost:3000/api/v1/repos/teleo/teleo-codex/pulls" \
|
||||
-H "Authorization: token $TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"title\":\"extract: $BASENAME\",\"head\":\"$BRANCH\",\"base\":\"main\"}" >> /dev/null 2>&1
|
||||
-d "{\"title\":\"extract: $BASENAME\",\"head\":\"$BRANCH\",\"base\":\"main\",\"body\":\"$PR_BODY\"}" >> /dev/null 2>&1
|
||||
|
||||
SUCCESS=$((SUCCESS + 1))
|
||||
echo " -> SUCCESS ($CHANGED files)" >> $LOG
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ def build_extraction_prompt(
|
|||
rationale: str | None = None,
|
||||
intake_tier: str | None = None,
|
||||
proposed_by: str | None = None,
|
||||
prior_art: str | None = None,
|
||||
) -> str:
|
||||
"""Build the lean extraction prompt.
|
||||
|
||||
|
|
@ -40,6 +41,7 @@ def build_extraction_prompt(
|
|||
rationale: Contributor's natural-language thesis about the source (optional)
|
||||
intake_tier: undirected | directed | challenge (optional)
|
||||
proposed_by: Contributor handle who submitted the source (optional)
|
||||
prior_art: Formatted prior art section from pre-screening (optional)
|
||||
|
||||
Returns:
|
||||
The complete prompt string
|
||||
|
|
@ -137,7 +139,19 @@ Single source = experimental at most. Pitch rhetoric or marketing copy = specula
|
|||
|
||||
{source_content}
|
||||
{contributor_directive}
|
||||
## KB Index (existing claims — check for duplicates and enrichment targets)
|
||||
{f"""## Prior Art (semantic pre-screening)
|
||||
|
||||
The following existing claims were found by semantic search against the major themes of this source. Use this to decide whether your extractions are NEW, ENRICHMENT, or CHALLENGE:
|
||||
|
||||
{prior_art}
|
||||
|
||||
**Classification rules:**
|
||||
- If your extraction makes the same argument as a prior art claim (similarity ≥ 0.80): classify as ENRICHMENT and cite the target claim's filename in `target_file`
|
||||
- If your extraction contradicts a prior art claim: classify as CHALLENGE enrichment and cite the target in `target_file`
|
||||
- If your extraction makes a genuinely different argument not covered by prior art: classify as NEW (claim)
|
||||
- ENRICHMENT and CHALLENGE enrichments MUST cite a specific `target_file` — "ENRICHMENT (general)" is rejected by the validator
|
||||
|
||||
""" if prior_art else ""}## KB Index (existing claims — check for duplicates and enrichment targets)
|
||||
|
||||
{kb_index}
|
||||
|
||||
|
|
|
|||
213
lib/pre_screen.py
Normal file
213
lib/pre_screen.py
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
"""Pre-screening: identify themes from source, fetch prior art from Qdrant.
|
||||
|
||||
Runs before extraction to show the extractor what the KB already knows.
|
||||
Reduces near-duplicates (our #1 rejection cause) by turning semantic
|
||||
pre-screening from a manual discipline into a pipeline feature.
|
||||
|
||||
Design: Leo (approved 2026-03-30). Owner: Epimetheus.
|
||||
|
||||
Flow:
|
||||
1. Haiku identifies 3-5 themes from source text
|
||||
2. Each theme + title (with author-stripped variant) → Tier 1 search
|
||||
3. Results injected into extraction prompt as "Prior Art"
|
||||
4. Extractor classifies extractions as NEW / ENRICHMENT / CHALLENGE
|
||||
5. ENRICHMENT/CHALLENGE must cite specific target claim (hard gate)
|
||||
|
||||
Cost: ~$0.002/source (Haiku theme pass) + free Qdrant queries.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
import requests
|
||||
|
||||
# Search library (same Tier 1 path used by Argus + Telegram bot)
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from lib.search import search
|
||||
|
||||
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
|
||||
THEME_MODEL = "anthropic/claude-haiku-4-5-20251001"
|
||||
|
||||
# Regex to strip a leading author/entity prefix (one or two name-like words
# followed by a separator) from titles, e.g.
#   "Shapiro: How Far Will AI Video Go"     → "How Far Will AI Video Go"
#   "Aschenbrenner — Situational Awareness" → "Situational Awareness"
# A colon, en dash or em dash may follow the name directly. A plain ASCII
# hyphen only counts as a separator when it has whitespace on both sides;
# otherwise a hyphenated opening word ("Self-driving cars ...") would be split
# at its internal hyphen and "Self-" would be treated as an author prefix.
AUTHOR_PREFIX_RE = re.compile(
    r"^[A-Za-z\-']+(?:\s+[A-Za-z\-']+)?(?:\s*[:–—]|\s+-(?=\s))\s*", re.UNICODE
)
|
||||
|
||||
|
||||
def identify_themes(source_content: str, api_key: str, source_title: str = "") -> list[str]:
    """Use Haiku to identify 3-5 major themes from source text.

    Sends the first 3000 characters of ``source_content`` to ``THEME_MODEL``
    through the OpenRouter chat-completions endpoint and parses the reply as
    a JSON array of short search queries.

    Args:
        source_content: Full source text; only a leading snippet is sent.
        api_key: OpenRouter API key, sent as a bearer token.
        source_title: Title used as the single fallback theme.

    Returns:
        Up to five non-blank theme strings suitable as search queries.
        If the request fails, or the reply is not a JSON array of strings,
        returns ``[source_title]`` (or ``[]`` when no title was given).
        Request and parsing errors are reported on stderr, never raised.
    """
    # Truncate source to keep Haiku costs minimal
    snippet = source_content[:3000]

    prompt = f"""Identify the 3-5 major themes or topics in this text.
Return ONLY a JSON array of short search queries (5-15 words each) that capture the key arguments.
Focus on the SPECIFIC mechanisms and claims, not general topic labels.

Example good output: ["futarchy fundraise oversubscription dynamics", "pro-rata capital allocation in ICOs"]
Example bad output: ["governance", "finance"]

Text:
{snippet}

Return JSON array only, no explanation."""

    # Theme identification is best-effort: any failure below degrades to the
    # title-only fallback instead of aborting the extraction run.
    try:
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://livingip.xyz",
            "X-Title": "Teleo Pre-Screen",
        }
        payload = {
            "model": THEME_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1,
            "max_tokens": 500,
        }
        resp = requests.post(OPENROUTER_URL, headers=headers, json=payload, timeout=30)
        resp.raise_for_status()
        content = resp.json()["choices"][0]["message"]["content"].strip()

        # Strip markdown fencing if present
        if content.startswith("```"):
            content = re.sub(r"^```(?:json)?\s*\n?", "", content)
            content = re.sub(r"\n?```\s*$", "", content)

        themes = json.loads(content)
        if isinstance(themes, list) and all(isinstance(t, str) for t in themes):
            # Blank entries would become empty search queries downstream.
            cleaned = [t.strip() for t in themes if t.strip()]
            return cleaned[:5]
        print(
            f"  WARN: Theme identification returned unexpected JSON shape: {type(themes).__name__}",
            file=sys.stderr,
        )
    except Exception as e:
        print(f"  WARN: Theme identification failed: {e}", file=sys.stderr)

    # Fallback: use title as the only theme
    return [source_title] if source_title else []
|
||||
|
||||
|
||||
def _strip_author(title: str) -> str:
    """Return ``title`` without its leading author/entity prefix.

    "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go"
    "Noah Smith — AI and Jobs" → "AI and Jobs"

    Returns an empty string when removing the prefix leaves nothing, leaves
    10 characters or fewer, or leaves the title unchanged, so callers can
    treat "" as "no useful author-stripped variant".
    """
    candidate = AUTHOR_PREFIX_RE.sub("", title).strip()
    # Reject results that are empty, too short, or identical to the input.
    if not candidate or len(candidate) <= 10 or candidate == title:
        return ""
    return candidate
|
||||
|
||||
|
||||
def _extract_title_from_source(source_content: str, source_file: str) -> str:
    """Get a usable title from source frontmatter or filename.

    Args:
        source_content: Raw source text, optionally with a ``title:`` line.
        source_file: Path of the source file; used when no title is found.

    Returns:
        The value of the first non-empty ``title:`` line with one pair of
        surrounding quotes removed. Otherwise the file's basename without a
        trailing ".md" and without a leading ``YYYY-MM-DD-`` date, with
        hyphens turned into spaces.
    """
    # Only horizontal whitespace may follow "title:". With re.MULTILINE, a
    # plain \s* would cross the newline of an empty "title:" line and capture
    # the NEXT line (e.g. "domain: ...") as the title. The trailing class also
    # accepts "\r" so CRLF files keep their closing quote out of the capture.
    match = re.search(
        r"^title:[ \t]*[\"']?(.+?)[\"']?[ \t\r]*$", source_content, re.MULTILINE
    )
    if match:
        title = match.group(1).strip()
        if title:
            return title

    # Fall back to filename
    basename = os.path.basename(source_file)
    if basename.endswith(".md"):
        # Drop only the extension, not every ".md" occurrence in the name.
        basename = basename[: -len(".md")]
    # Strip date prefix (e.g., "2026-03-15-article-name" → "article-name")
    basename = re.sub(r"^\d{4}-\d{2}-\d{2}-", "", basename)
    return basename.replace("-", " ")
|
||||
|
||||
|
||||
def pre_screen(source_content: str, source_file: str, api_key: str,
               domain: str | None = None) -> dict:
    """Run full pre-screening: themes → search → prior art.

    Args:
        source_content: Raw source text (frontmatter + body).
        source_file: Path of the source file, used for the title fallback.
        api_key: OpenRouter API key for the theme-identification call.
        domain: Accepted for interface compatibility; not used, because the
            search below is deliberately cross-domain.

    Returns:
        {
            "themes": ["theme1", "theme2", ...],
            "prior_art": [
                {"claim_path": str, "title": str, "score": float, "query": str},
                ...
            ],
            "search_queries": ["query1", "query2", ...],  # for audit trail
        }

        ``prior_art`` holds at most 25 entries, one per claim path, sorted by
        score descending. When several queries hit the same claim, the entry
        keeps the highest score and the query that produced it.
    """
    title = _extract_title_from_source(source_content, source_file)

    # Step 1: Identify themes
    themes = identify_themes(source_content, api_key, source_title=title)

    # Step 2: Build search queries (themes + title + author-stripped title)
    queries = list(themes)
    if title and title not in queries:
        queries.append(title)
    stripped = _strip_author(title)
    if stripped and stripped not in queries:
        queries.append(stripped)

    # Step 3: Search Qdrant for each query (Tier 1: expand=False)
    # Keyed by claim path; insertion order is first-seen order, which the
    # stable sort below preserves for equal scores.
    best_by_path: dict[str, dict] = {}

    for query in queries:
        # A failing query is logged and skipped; the other queries still run.
        try:
            results = search(query, expand=False, domain=None)  # cross-domain on purpose
            for hit in results.get("direct_results", []):
                path = hit.get("claim_path", "")
                if not path:
                    continue
                # Treat a missing or null score as 0 instead of raising.
                score = round(float(hit.get("score") or 0), 3)
                known = best_by_path.get(path)
                if known is not None and known["score"] >= score:
                    # Already recorded with an equal or better match.
                    continue
                best_by_path[path] = {
                    "claim_path": path,
                    # Fall back to a filename-derived title when the hit's
                    # title is missing or empty.
                    "title": hit.get("title")
                    or os.path.basename(path).replace(".md", "").replace("-", " "),
                    "score": score,
                    "query": query,
                }
        except Exception as e:
            print(f"  WARN: Pre-screen search failed for '{query[:50]}': {e}", file=sys.stderr)

    # Sort by score descending, cap at 25 entries
    prior_art = sorted(best_by_path.values(), key=lambda x: x["score"], reverse=True)[:25]

    return {
        "themes": themes,
        "prior_art": prior_art,
        "search_queries": queries,
    }
|
||||
|
||||
|
||||
def format_prior_art_for_prompt(prior_art: list[dict]) -> str:
    """Render prior art as a markdown bullet list for the extraction prompt.

    Each entry becomes one line in Leo's required format:
    - [claim-slug](path) — similarity: 0.82 — query: "theme that matched"

    The slug is the claim file's basename without ".md", and the query is
    cut to its first 60 characters. An empty list yields a fixed sentence
    saying that no similar claims were found.
    """
    if not prior_art:
        return "No similar claims found in the KB. This source likely covers novel territory."

    def _bullet(entry: dict) -> str:
        # One markdown bullet for a single prior-art entry.
        path = entry["claim_path"]
        name = os.path.basename(path).replace(".md", "")
        shown_query = entry["query"][:60]
        return f"- [{name}]({path}) — similarity: {entry['score']:.2f} — query: \"{shown_query}\""

    return "\n".join(_bullet(entry) for entry in prior_art)
|
||||
|
||||
|
||||
def format_prior_art_for_pr(prior_art: list[dict]) -> str:
    """Render prior art as a markdown section for the PR body.

    The output is a "## Prior Art" heading, a blank line, then one bullet per
    entry with its similarity score and the matching query (cut to its first
    80 characters), ending with a newline. An empty list yields a single
    "no prior art" sentence instead.
    """
    if not prior_art:
        return "No prior art found — source covers novel territory.\n"

    body = ["## Prior Art (automated pre-screening)\n"]
    for entry in prior_art:
        path = entry["claim_path"]
        name = os.path.basename(path).replace(".md", "")
        shown_query = entry["query"][:80]
        body.append(
            f"- [{name}]({path}) — similarity: {entry['score']:.2f} — matched query: \"{shown_query}\""
        )
    # The empty last element makes the joined text end with a newline.
    body.append("")
    return "\n".join(body)
|
||||
|
|
@ -41,6 +41,7 @@ from lib.post_extract import (
|
|||
validate_and_fix_entities,
|
||||
)
|
||||
from lib.connect import connect_new_claims
|
||||
from lib.pre_screen import pre_screen, format_prior_art_for_prompt, format_prior_art_for_pr
|
||||
|
||||
# ─── Source registration (Argus: pipeline funnel tracking) ─────────────────
|
||||
|
||||
|
|
@ -354,6 +355,22 @@ def main():
|
|||
# Load existing claims for post-extraction validation
|
||||
existing_claims = load_existing_claims_from_repo(".")
|
||||
|
||||
# ── Pre-screening: identify themes, fetch prior art from Qdrant ──
|
||||
prior_art_text = None
|
||||
prior_art_pr_text = None
|
||||
pre_screen_data = None
|
||||
if api_key:
|
||||
try:
|
||||
pre_screen_data = pre_screen(source_content, args.source_file, api_key, domain=domain)
|
||||
if pre_screen_data["prior_art"]:
|
||||
prior_art_text = format_prior_art_for_prompt(pre_screen_data["prior_art"])
|
||||
prior_art_pr_text = format_prior_art_for_pr(pre_screen_data["prior_art"])
|
||||
print(f"Pre-screen: {len(pre_screen_data['themes'])} themes → {len(pre_screen_data['prior_art'])} prior art claims")
|
||||
else:
|
||||
print(f"Pre-screen: {len(pre_screen_data['themes'])} themes → no prior art (novel territory)")
|
||||
except Exception as e:
|
||||
print(f" WARN: Pre-screening failed (non-fatal): {e}", file=sys.stderr)
|
||||
|
||||
# ── Build lean prompt ──
|
||||
# Extract rationale and intake_tier from source frontmatter (directed contribution)
|
||||
rationale = None
|
||||
|
|
@ -381,6 +398,7 @@ def main():
|
|||
prompt = build_extraction_prompt(
|
||||
args.source_file, source_content, domain, agent, kb_index,
|
||||
rationale=rationale, intake_tier=intake_tier, proposed_by=proposed_by,
|
||||
prior_art=prior_art_text,
|
||||
)
|
||||
|
||||
if args.dry_run:
|
||||
|
|
@ -390,6 +408,13 @@ def main():
|
|||
print(f"Model: {args.model}")
|
||||
print(f"Existing claims: {len(existing_claims)}")
|
||||
print(f"Prompt length: {len(prompt)} chars")
|
||||
if pre_screen_data:
|
||||
print(f"\n=== PRE-SCREEN ===")
|
||||
print(f"Themes: {pre_screen_data['themes']}")
|
||||
print(f"Queries: {pre_screen_data['search_queries']}")
|
||||
print(f"Prior art ({len(pre_screen_data['prior_art'])} claims):")
|
||||
for pa in pre_screen_data['prior_art']:
|
||||
print(f" {pa['score']:.2f} {pa['title'][:60]} (query: {pa['query'][:40]})")
|
||||
print(f"\n=== PROMPT ===\n{prompt[:1000]}...")
|
||||
return
|
||||
|
||||
|
|
@ -461,7 +486,7 @@ def main():
|
|||
if written:
|
||||
written_paths = [os.path.join(domain_dir, f) for f in written]
|
||||
try:
|
||||
connect_stats = connect_new_claims(written_paths, domain=domain)
|
||||
connect_stats = connect_new_claims(written_paths)
|
||||
if connect_stats["connected"] > 0:
|
||||
print(f" Connected: {connect_stats['connected']}/{len(written)} claims → {connect_stats['edges_added']} edges")
|
||||
for conn in connect_stats.get("connections", []):
|
||||
|
|
@ -591,6 +616,11 @@ def main():
|
|||
source_update["entities_enqueued"] = entities_enqueued
|
||||
if facts:
|
||||
source_update["key_facts"] = facts
|
||||
if pre_screen_data and pre_screen_data.get("prior_art"):
|
||||
source_update["notes"] = source_update.get("notes", "")
|
||||
if source_update["notes"]:
|
||||
source_update["notes"] += "; "
|
||||
source_update["notes"] += f"pre-screen: {len(pre_screen_data['prior_art'])} prior art claims from {len(pre_screen_data['themes'])} themes"
|
||||
if not written and not enriched and not entities_enqueued:
|
||||
source_update["notes"] = (
|
||||
f"LLM returned {len(raw_claims)} claims, "
|
||||
|
|
@ -604,6 +634,17 @@ def main():
|
|||
db_status = "extracted" if status == "processed" else ("null_result" if status == "null-result" else status)
|
||||
_register_source(_src_conn, args.source_file, db_status, domain, args.model, len(written))
|
||||
|
||||
# ── Save prior art for PR body (batch-extract reads this) ──
|
||||
if prior_art_pr_text:
|
||||
prior_art_path = os.path.join(
|
||||
os.path.dirname(args.source_file) or ".",
|
||||
".prior-art",
|
||||
os.path.basename(args.source_file).replace(".md", ".txt"),
|
||||
)
|
||||
os.makedirs(os.path.dirname(prior_art_path), exist_ok=True)
|
||||
with open(prior_art_path, "w") as f:
|
||||
f.write(prior_art_pr_text)
|
||||
|
||||
# ── Save debug info for rejected claims ──
|
||||
if rejected_claims:
|
||||
debug_dir = os.path.join(os.path.dirname(args.source_file) or ".", ".extraction-debug")
|
||||
|
|
@ -626,10 +667,12 @@ def main():
|
|||
|
||||
# ── Summary ──
|
||||
print(f"\n{'='*60}")
|
||||
print(f" EXTRACTION COMPLETE (v2)")
|
||||
print(f" EXTRACTION COMPLETE (v2 + pre-screen)")
|
||||
print(f" Source: {args.source_file}")
|
||||
print(f" Agent: {agent}")
|
||||
print(f" Model: {args.model} ({p1_in} in / {p1_out} out)")
|
||||
if pre_screen_data:
|
||||
print(f" Pre-screen: {len(pre_screen_data['themes'])} themes → {len(pre_screen_data['prior_art'])} prior art")
|
||||
print(f" Pass 2: Python validator ($0)")
|
||||
print(f" Claims: {len(written)} written, {claim_stats['rejected']} rejected, {claim_stats['fixed']} auto-fixed")
|
||||
print(f" Connected: {connect_stats.get('connected', 0)} claims → {connect_stats.get('edges_added', 0)} edges (Qdrant)")
|
||||
|
|
|
|||
66
tests/test_pre_screen.py
Normal file
66
tests/test_pre_screen.py
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
"""Tests for lib/pre_screen.py — extraction pre-screening."""
|
||||
|
||||
import pytest
|
||||
from lib.pre_screen import (
|
||||
_strip_author,
|
||||
_extract_title_from_source,
|
||||
format_prior_art_for_prompt,
|
||||
format_prior_art_for_pr,
|
||||
)
|
||||
|
||||
|
||||
class TestStripAuthor:
    """Behaviour of _strip_author on titles with and without an author prefix."""

    def test_colon_prefix(self):
        """A single-word author followed by a colon is removed."""
        stripped = _strip_author("Shapiro: How Far Will AI Video Go")
        assert stripped == "How Far Will AI Video Go"

    def test_dash_prefix(self):
        """An author followed by an em dash is removed."""
        stripped = _strip_author("Aschenbrenner — Situational Awareness Research")
        assert stripped == "Situational Awareness Research"

    def test_no_prefix(self):
        """A title without an author prefix yields the empty string."""
        stripped = _strip_author("How Far Will AI Video Go")
        assert stripped == ""

    def test_short_result_returns_empty(self):
        """If the stripped version is too short, the result is empty."""
        stripped = _strip_author("Shapiro: AI")
        assert stripped == ""

    def test_hyphenated_name(self):
        """A hyphenated author name counts as a single prefix word."""
        stripped = _strip_author("Noah-Smith: The Future of AI")
        assert stripped == "The Future of AI"
|
||||
|
||||
|
||||
class TestExtractTitle:
    """Title resolution from the frontmatter, then from the filename."""

    def test_frontmatter_title(self):
        """A quoted frontmatter title wins over the filename."""
        source = '---\ntitle: "My Great Article"\ndomain: ai-alignment\n---\n\nBody text.'
        title = _extract_title_from_source(source, "2026-03-15-some-file.md")
        assert title == "My Great Article"

    def test_filename_fallback(self):
        """Without a title field, the filename is used."""
        source = "---\ndomain: ai-alignment\n---\n\nNo title field."
        title = _extract_title_from_source(source, "2026-03-15-some-great-article.md")
        assert title == "some great article"

    def test_date_stripped_from_filename(self):
        """The leading date is removed from a filename-derived title."""
        source = "no frontmatter"
        title = _extract_title_from_source(source, "2026-03-15-article-name.md")
        assert title == "article name"
|
||||
|
||||
|
||||
class TestFormatPriorArt:
    """Rendering of prior-art entries for the prompt and for the PR body."""

    def test_empty(self):
        """An empty list produces the 'novel territory' notice."""
        rendered = format_prior_art_for_prompt([])
        assert "novel territory" in rendered

    def test_with_results(self):
        """Every entry's slug appears, along with the first entry's score."""
        entries = [
            {"claim_path": "domains/ai/claim-one.md", "title": "Claim One", "score": 0.85, "query": "AI safety"},
            {"claim_path": "domains/ai/claim-two.md", "title": "Claim Two", "score": 0.72, "query": "alignment"},
        ]
        rendered = format_prior_art_for_prompt(entries)
        for expected in ("claim-one", "0.85", "claim-two"):
            assert expected in rendered

    def test_pr_format(self):
        """The PR rendering carries the section heading, slug and score."""
        entries = [
            {"claim_path": "domains/ai/claim-one.md", "title": "Claim One", "score": 0.85, "query": "AI safety"},
        ]
        rendered = format_prior_art_for_pr(entries)
        for expected in ("## Prior Art", "claim-one", "0.85"):
            assert expected in rendered
|
||||
Loading…
Reference in a new issue