Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source of truth. Previously only 8 of 67 files existed in repo — the rest were deployed directly to VPS via SCP, causing massive drift. Includes: - pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.) - pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh - diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics) - agent-state/: bootstrap, lib-state, cascade inbox processor, schema - systemd/: service unit files for reference - deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate - research-session.sh: updated with Step 8.5 digest + cascade inbox processing No new code written — all files are exact copies from VPS as of 2026-04-06. From this point forward: edit in repo, commit, then deploy.sh. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
221 lines
8.3 KiB
Python
221 lines
8.3 KiB
Python
"""Pre-screening: identify themes from source, fetch prior art from Qdrant.
|
|
|
|
Runs before extraction to show the extractor what the KB already knows.
|
|
Reduces near-duplicates (our #1 rejection cause) by turning semantic
|
|
pre-screening from a manual discipline into a pipeline feature.
|
|
|
|
Design: Leo (approved 2026-03-30). Owner: Epimetheus.
|
|
|
|
Flow:
|
|
1. Haiku identifies 3-5 themes from source text
|
|
2. Each theme + title (with author-stripped variant) → Tier 1 search
|
|
3. Results injected into extraction prompt as "Prior Art"
|
|
4. Extractor classifies extractions as NEW / ENRICHMENT / CHALLENGE
|
|
5. ENRICHMENT/CHALLENGE must cite specific target claim (hard gate)
|
|
|
|
Cost: ~$0.002/source (Haiku theme pass) + free Qdrant queries.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
import requests
|
|
|
|
# Search library (same Tier 1 path used by Argus + Telegram bot)
|
|
from pathlib import Path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
from lib.search import search
|
|
|
|
# OpenRouter chat-completions endpoint used for the theme pass.
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

# Cheap model for theme identification (~$0.002/source, per module docstring).
THEME_MODEL = "anthropic/claude-haiku-4.5"

# Prior art threshold — only show results above this score to the extractor.
# 0.50 catches mechanism-level matches where compound themes dilute embeddings.
# Was 0.65 but Haiku compound themes score 0.50-0.60 even on exact matches.
# False positives cost nothing (extractor sees irrelevant prior art, ignores it).
# False negatives cost wasted extraction + review + rejection.
PRIOR_ART_THRESHOLD = 0.50

# Regex to strip leading author/entity patterns from titles:
# one or two leading words followed by ":", "–", "—", or "-".
# e.g. "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go"
# "Aschenbrenner — Situational Awareness" → "Situational Awareness"
AUTHOR_PREFIX_RE = re.compile(
    r"^[A-Za-z\-']+(?:\s+[A-Za-z\-']+)?\s*[:–—\-]\s*", re.UNICODE
)
|
|
|
|
|
|
def identify_themes(source_content: str, api_key: str, source_title: str = "") -> list[str]:
    """Use Haiku to identify 3-5 major themes from source text.

    Args:
        source_content: Full source text; only the first 3000 chars are sent.
        api_key: OpenRouter API key (sent as a Bearer token).
        source_title: Optional title used as the sole fallback theme when
            the API call or response parsing fails.

    Returns:
        Up to 5 non-empty theme strings suitable as search queries.
        On any failure: [source_title] if a title was given, else [].
    """
    # Truncate source to keep Haiku costs minimal
    snippet = source_content[:3000]

    prompt = f"""Identify the 3-5 major themes or topics in this text.
Return ONLY a JSON array of short search queries (3-8 words each).
Keep queries SHORT — 3-5 words is ideal. Compound phrases score poorly in vector search.

Example good output: ["futarchy governance", "semaglutide kidney outcomes", "ICO oversubscription"]
Example bad output: ["futarchy governance mechanisms detecting revenue misrepresentation token launches", "prediction market accuracy identifying fraudulent financial claims"]

Text:
{snippet}

Return JSON array only, no explanation."""

    try:
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://livingip.xyz",
            "X-Title": "Teleo Pre-Screen",
        }
        payload = {
            "model": THEME_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1,
            "max_tokens": 500,
        }
        resp = requests.post(OPENROUTER_URL, headers=headers, json=payload, timeout=30)
        resp.raise_for_status()
        content = resp.json()["choices"][0]["message"]["content"].strip()

        # Strip markdown fencing if present
        if content.startswith("```"):
            content = re.sub(r"^```(?:json)?\s*\n?", "", content)
            content = re.sub(r"\n?```\s*$", "", content)

        parsed = json.loads(content)
        if isinstance(parsed, list) and all(isinstance(t, str) for t in parsed):
            # Drop empty/whitespace-only entries the model occasionally
            # emits — they would otherwise become useless search queries.
            themes = [t.strip() for t in parsed if t.strip()]
            return themes[:5]
        # Non-list / non-string payloads fall through to the fallback below.
    except Exception as e:
        # Deliberate best-effort: theme identification is an optimization,
        # not a gate — log and degrade to a title-only search.
        print(f"  WARN: Theme identification failed: {e}", file=sys.stderr)

    # Fallback: use title as the only theme
    return [source_title] if source_title else []
|
|
|
|
|
|
def _strip_author(title: str) -> str:
    """Strip a leading author/entity prefix from a title.

    "Shapiro: How Far Will AI Video Go" → "How Far Will AI Video Go"
    "Noah Smith — AI and Jobs" → "AI and Jobs"

    Returns "" unless the stripped remainder is non-trivial (longer than
    10 chars) and actually differs from the input.
    """
    remainder = AUTHOR_PREFIX_RE.sub("", title).strip()
    # Guard: nothing stripped, or remainder too short to stand alone.
    if not remainder or remainder == title or len(remainder) <= 10:
        return ""
    return remainder
|
|
|
|
|
|
def _extract_title_from_source(source_content: str, source_file: str) -> str:
|
|
"""Get a usable title from source frontmatter or filename."""
|
|
# Try frontmatter title
|
|
match = re.search(r"^title:\s*[\"']?(.+?)[\"']?\s*$", source_content, re.MULTILINE)
|
|
if match:
|
|
return match.group(1).strip()
|
|
|
|
# Fall back to filename
|
|
basename = os.path.basename(source_file).replace(".md", "")
|
|
# Strip date prefix (e.g., "2026-03-15-article-name" → "article-name")
|
|
basename = re.sub(r"^\d{4}-\d{2}-\d{2}-", "", basename)
|
|
return basename.replace("-", " ")
|
|
|
|
|
|
def pre_screen(source_content: str, source_file: str, api_key: str,
               domain: str | None = None) -> dict:
    """Run full pre-screening: themes → search → prior art.

    Returns:
        {
            "themes": ["theme1", "theme2", ...],
            "prior_art": [
                {"claim_path": str, "title": str, "score": float, "query": str},
                ...
            ],
            "search_queries": ["query1", "query2", ...],  # for audit trail
        }
    """
    title = _extract_title_from_source(source_content, source_file)

    # Step 1: Identify themes via Haiku (falls back to the title on failure)
    themes = identify_themes(source_content, api_key, source_title=title)

    # Step 2: Query list = themes, then the full title, then the
    # author-stripped title variant — skipping empties and duplicates.
    queries = list(themes)
    for extra in (title, _strip_author(title)):
        if extra and extra not in queries:
            queries.append(extra)

    # Step 3: Tier 1 search (expand=False) per query, de-duplicating
    # hits by claim path across queries.
    seen_paths: set[str] = set()
    prior_art: list[dict] = []

    for query in queries:
        try:
            results = search(query, expand=False, domain=None)  # cross-domain on purpose
            for hit in results.get("direct_results", []):
                path = hit.get("claim_path", "")
                if not path or path in seen_paths:
                    continue
                seen_paths.add(path)
                fallback_title = os.path.basename(path).replace(".md", "").replace("-", " ")
                prior_art.append({
                    "claim_path": path,
                    "title": hit.get("title", fallback_title),
                    "score": round(hit.get("score", 0), 3),
                    "query": query,
                })
        except Exception as e:
            print(f"  WARN: Pre-screen search failed for '{query[:50]}': {e}", file=sys.stderr)

    # Keep only hits at/above threshold, best score first, capped at 25.
    prior_art = sorted(
        (p for p in prior_art if p["score"] >= PRIOR_ART_THRESHOLD),
        key=lambda p: p["score"],
        reverse=True,
    )[:25]

    return {
        "themes": themes,
        "prior_art": prior_art,
        "search_queries": queries,
    }
|
|
|
|
|
|
def format_prior_art_for_prompt(prior_art: list[dict]) -> str:
    """Format prior art results for injection into the extraction prompt.

    Leo's required format:
    - [claim-slug](path) — similarity: 0.82 — query: "theme that matched"
    """
    if not prior_art:
        return "No similar claims found in the KB. This source likely covers novel territory."

    def _bullet(item: dict) -> str:
        # Slug = basename with the .md extension dropped.
        slug = os.path.basename(item["claim_path"]).replace(".md", "")
        return (
            f"- [{slug}]({item['claim_path']}) — similarity: "
            f"{item['score']:.2f} — query: \"{item['query'][:60]}\""
        )

    return "\n".join(_bullet(item) for item in prior_art)
|
|
|
|
|
|
def format_prior_art_for_pr(prior_art: list[dict]) -> str:
    """Format prior art for PR body (structured, reviewable by Leo).

    Shows similarity score + which query matched for verification.
    """
    if not prior_art:
        return "No prior art found — source covers novel territory.\n"

    out = ["## Prior Art (automated pre-screening)\n"]
    out.extend(
        f"- [{os.path.basename(item['claim_path']).replace('.md', '')}]"
        f"({item['claim_path']}) — similarity: {item['score']:.2f}"
        f" — matched query: \"{item['query'][:80]}\""
        for item in prior_art
    )
    # Trailing empty entry yields a final newline after join.
    out.append("")
    return "\n".join(out)
|