teleo-infrastructure/tests/test_pre_screen.py
m3taversal 8c51e47c4e feat: extraction pre-screening via Qdrant semantic search
Before extraction, the pipeline now:
1. Identifies 3-5 themes from source (Haiku, ~$0.002/source)
2. Searches Qdrant for each theme + title (with author-stripped variant)
3. Injects "Prior Art" into extraction prompt showing existing KB claims
4. Requires ENRICHMENT/CHALLENGE to cite specific target_claim (hard gate)

Reduces near-duplicate extractions (our #1 rejection cause) by showing
the extractor what the KB already knows before it starts.

Prior art is also persisted to .prior-art/ sidecar files and included in
the PR body for reviewer verification.

Design: Leo. Owner: Epimetheus.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 11:17:38 +01:00

66 lines
2.4 KiB
Python

"""Tests for lib/pre_screen.py — extraction pre-screening."""
import pytest
from lib.pre_screen import (
_strip_author,
_extract_title_from_source,
format_prior_art_for_prompt,
format_prior_art_for_pr,
)
class TestStripAuthor:
    """Tests for ``_strip_author``, which removes a leading author prefix from a title."""

    def test_colon_prefix(self):
        """An ``Author: Title`` string is reduced to the title part."""
        assert _strip_author("Shapiro: How Far Will AI Video Go") == "How Far Will AI Video Go"

    def test_dash_prefix(self):
        """An em-dash separator between author and title is handled as well."""
        assert _strip_author("Aschenbrenner — Situational Awareness Research") == "Situational Awareness Research"

    def test_no_prefix(self):
        """A title with no author prefix yields an empty string (no variant to search)."""
        assert _strip_author("How Far Will AI Video Go") == ""

    def test_short_result_returns_empty(self):
        """A remainder that is too short after stripping is discarded."""
        # If stripped version is too short, return empty
        assert _strip_author("Shapiro: AI") == ""

    def test_hyphenated_name(self):
        """A hyphenated author name is still recognised as a prefix."""
        assert _strip_author("Noah-Smith: The Future of AI") == "The Future of AI"
class TestExtractTitle:
    """Tests for ``_extract_title_from_source`` (front-matter title with filename fallback)."""

    def test_frontmatter_title(self):
        """A quoted ``title:`` field in the front matter is returned without its quotes."""
        content = '---\ntitle: "My Great Article"\ndomain: ai-alignment\n---\n\nBody text.'
        assert _extract_title_from_source(content, "2026-03-15-some-file.md") == "My Great Article"

    def test_filename_fallback(self):
        """Without a ``title:`` field the title is derived from the filename."""
        content = "---\ndomain: ai-alignment\n---\n\nNo title field."
        assert _extract_title_from_source(content, "2026-03-15-some-great-article.md") == "some great article"

    def test_date_stripped_from_filename(self):
        """The leading date and ``.md`` suffix are dropped; hyphens become spaces."""
        content = "no frontmatter"
        assert _extract_title_from_source(content, "2026-03-15-article-name.md") == "article name"
class TestFormatPriorArt:
    """Tests for the prior-art formatters used in the extraction prompt and the PR body."""

    def test_empty(self):
        """With no prior art, the prompt text mentions "novel territory"."""
        result = format_prior_art_for_prompt([])
        assert "novel territory" in result

    def test_with_results(self):
        """Each claim's path stem and the first claim's score appear in the prompt text."""
        prior_art = [
            {"claim_path": "domains/ai/claim-one.md", "title": "Claim One", "score": 0.85, "query": "AI safety"},
            {"claim_path": "domains/ai/claim-two.md", "title": "Claim Two", "score": 0.72, "query": "alignment"},
        ]
        result = format_prior_art_for_prompt(prior_art)
        assert "claim-one" in result
        assert "0.85" in result
        assert "claim-two" in result

    def test_pr_format(self):
        """The PR rendering carries a "## Prior Art" heading plus the claim and its score."""
        prior_art = [
            {"claim_path": "domains/ai/claim-one.md", "title": "Claim One", "score": 0.85, "query": "AI safety"},
        ]
        result = format_prior_art_for_pr(prior_art)
        assert "## Prior Art" in result
        assert "claim-one" in result
        assert "0.85" in result