Before extraction, the pipeline now:
1. Identifies 3-5 themes from the source (Haiku, ~$0.002/source)
2. Searches Qdrant for each theme plus the title (with an author-stripped variant)
3. Injects "Prior Art" into the extraction prompt, showing existing KB claims
4. Requires ENRICHMENT/CHALLENGE to cite a specific target_claim (hard gate)

This reduces near-duplicate extractions (our #1 rejection cause) by showing the extractor what the KB already knows before it starts. Prior art is also persisted to .prior-art/ sidecar files and included in the PR body for reviewer verification.

Design: Leo. Owner: Epimetheus.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
66 lines
2.4 KiB
Python
66 lines
2.4 KiB
Python
"""Tests for lib/pre_screen.py — extraction pre-screening."""
|
|
|
|
import pytest
|
|
from lib.pre_screen import (
|
|
_strip_author,
|
|
_extract_title_from_source,
|
|
format_prior_art_for_prompt,
|
|
format_prior_art_for_pr,
|
|
)
|
|
|
|
|
|
class TestStripAuthor:
    """Tests for ``_strip_author``: removal of a leading author prefix from a title.

    Each case pins one input string to the exact string the helper is
    expected to return; an empty string is the expected "nothing usable"
    result in the no-prefix and too-short cases below.
    """

    def test_colon_prefix(self):
        """An ``Author: Title`` string yields only the title part."""
        stripped = _strip_author("Shapiro: How Far Will AI Video Go")
        assert stripped == "How Far Will AI Video Go"

    def test_dash_prefix(self):
        """An ``Author — Title`` string (em dash separator) yields only the title part."""
        stripped = _strip_author("Aschenbrenner — Situational Awareness Research")
        assert stripped == "Situational Awareness Research"

    def test_no_prefix(self):
        """A title without an author prefix is expected to produce an empty string."""
        stripped = _strip_author("How Far Will AI Video Go")
        assert stripped == ""

    def test_short_result_returns_empty(self):
        """A prefix is present, but the remainder is too short to be kept."""
        # If stripped version is too short, return empty
        stripped = _strip_author("Shapiro: AI")
        assert stripped == ""

    def test_hyphenated_name(self):
        """A hyphenated author name before the colon is still treated as a prefix."""
        stripped = _strip_author("Noah-Smith: The Future of AI")
        assert stripped == "The Future of AI"
|
|
|
|
|
|
class TestExtractTitle:
    """Tests for ``_extract_title_from_source(content, filename)``.

    The cases cover a title taken from the frontmatter and the fallback
    to a title derived from the file name.
    """

    def test_frontmatter_title(self):
        """A quoted ``title:`` field in the frontmatter is returned without the quotes."""
        content = '---\ntitle: "My Great Article"\ndomain: ai-alignment\n---\n\nBody text.'
        title = _extract_title_from_source(content, "2026-03-15-some-file.md")
        assert title == "My Great Article"

    def test_filename_fallback(self):
        """Frontmatter without a ``title:`` field falls back to the file name."""
        content = "---\ndomain: ai-alignment\n---\n\nNo title field."
        title = _extract_title_from_source(content, "2026-03-15-some-great-article.md")
        # Expected: date prefix and extension dropped, hyphens turned into spaces.
        assert title == "some great article"

    def test_date_stripped_from_filename(self):
        """Content with no frontmatter at all also uses the date-stripped file name."""
        content = "no frontmatter"
        title = _extract_title_from_source(content, "2026-03-15-article-name.md")
        assert title == "article name"
|
|
|
|
|
|
class TestFormatPriorArt:
    """Tests for the prior-art formatters (extraction prompt and PR body variants).

    Prior-art entries are dicts with the keys ``claim_path``, ``title``,
    ``score`` and ``query``, as constructed in the fixtures below.
    """

    def test_empty(self):
        """An empty prior-art list produces text mentioning "novel territory"."""
        result = format_prior_art_for_prompt([])
        assert "novel territory" in result

    def test_with_results(self):
        """The prompt text names every claim and shows the first entry's score."""
        prior_art = [
            {"claim_path": "domains/ai/claim-one.md", "title": "Claim One", "score": 0.85, "query": "AI safety"},
            {"claim_path": "domains/ai/claim-two.md", "title": "Claim Two", "score": 0.72, "query": "alignment"},
        ]
        result = format_prior_art_for_prompt(prior_art)
        assert "claim-one" in result
        assert "0.85" in result
        assert "claim-two" in result

    def test_pr_format(self):
        """The PR-body text has a ``## Prior Art`` heading plus the claim and its score."""
        prior_art = [
            {"claim_path": "domains/ai/claim-one.md", "title": "Claim One", "score": 0.85, "query": "AI safety"},
        ]
        result = format_prior_art_for_pr(prior_art)
        assert "## Prior Art" in result
        assert "claim-one" in result
        assert "0.85" in result
|