Layer 1: Insertion-time dedup in openrouter-extract-v2.py — skip if source_slug already appears in claim content. Layer 2: Insertion-time dedup in entity_batch.py — skip if PR number already enriched this claim. Layer 3: Post-rebase dedup in merge.py — scan rebased files for duplicate evidence blocks (same source reference) and remove them before force-push. Root cause: multiple enrichment branches modify the same claim at the same insertion point. When rebased sequentially, evidence blocks are duplicated. (Leo: PRs #1751, #1752) lib/dedup.py: standalone module — parses evidence headers, deduplicates by source key, preserves trailing content (Relevant Notes, Topics sections). 9 tests covering all patterns including the real PR #1751 duplication case. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
159 lines
6.5 KiB
Python
159 lines
6.5 KiB
Python
"""Tests for enrichment idempotency — dedup at insertion and post-rebase."""
|
|
|
|
import os
|
|
import tempfile
|
|
|
|
import pytest
|
|
|
|
|
|
# ─── Unit tests for dedup_evidence_blocks ────────────────────────────────
|
|
|
|
|
|
from lib.dedup import dedup_evidence_blocks
|
|
|
|
|
|
class TestDedupEvidenceBlocks:
    """Test the post-rebase evidence block deduplication.

    Each test passes the text of a claim file to ``dedup_evidence_blocks``
    and inspects the returned text; nothing here touches the filesystem.
    """

    def test_no_blocks_unchanged(self):
        """Content without any evidence block comes back unmodified."""
        content = "---\ntype: claim\n---\n\nSome claim body.\n"
        assert dedup_evidence_blocks(content) == content

    def test_single_block_unchanged(self):
        """A lone evidence block has no duplicate and is returned verbatim."""
        content = (
            "---\ntype: claim\n---\n\nClaim body.\n\n"
            "### Additional Evidence (extend)\n"
            "*Source: [[some-source-2026-03-19]] | Added: 2026-03-19*\n\n"
            "Evidence text here.\n"
        )
        assert dedup_evidence_blocks(content) == content

    def test_duplicate_blocks_removed(self):
        """Two evidence blocks from the same source — second is removed."""
        block = (
            "\n\n### Additional Evidence (extend)\n"
            "*Source: [[interlune-he3-quantum-demand]] | Added: 2026-03-19*\n\n"
            "Some evidence text.\n"
        )
        content = f"---\ntype: claim\n---\n\nClaim body.{block}{block}\nRelevant Notes:\n"
        result = dedup_evidence_blocks(content)
        # Should contain exactly one occurrence
        assert result.count("[[interlune-he3-quantum-demand]]") == 1
        # The whole duplicate block must go, not just its source line;
        # otherwise an orphaned evidence paragraph would be left behind.
        assert result.count("Some evidence text.") == 1
        # Content following the evidence blocks must survive the dedup.
        assert "Relevant Notes:" in result

    def test_different_sources_kept(self):
        """Two evidence blocks from different sources — both kept."""
        block1 = (
            "\n\n### Additional Evidence (extend)\n"
            "*Source: [[source-a]] | Added: 2026-03-19*\n\n"
            "Evidence A.\n"
        )
        block2 = (
            "\n\n### Additional Evidence (challenge)\n"
            "*Source: [[source-b]] | Added: 2026-03-20*\n\n"
            "Evidence B.\n"
        )
        content = f"---\ntype: claim\n---\n\nClaim body.{block1}{block2}"
        result = dedup_evidence_blocks(content)
        assert "[[source-a]]" in result
        assert "[[source-b]]" in result
        # Both evidence bodies must be retained along with their headers.
        assert "Evidence A." in result
        assert "Evidence B." in result

    def test_auto_enrichment_dedup(self):
        """Duplicate auto-enrichment blocks from substantive fixer."""
        block = (
            "\n\n### Auto-enrichment (near-duplicate conversion, similarity=0.92)\n"
            "*Source: PR #1234 — \"Some duplicate claim\"*\n\n"
            "Converted evidence.\n"
        )
        content = f"---\ntype: claim\n---\n\nBody.{block}{block}"
        result = dedup_evidence_blocks(content)
        assert result.count("PR #1234") == 1
        # The duplicate's body text must be removed together with its header.
        assert result.count("Converted evidence.") == 1

    def test_mixed_types_dedup(self):
        """Same source twice with differing body text — first occurrence wins.

        Despite the test name, both fixtures use the
        "Additional Evidence (extend)" header; what is pinned here is that
        the earlier block is kept and the later one dropped even when their
        evidence text differs.
        """
        # NOTE(review): the cross-type case (same source under an
        # "Auto-enrichment" header) is not exercised by this test.
        block1 = (
            "\n\n### Additional Evidence (extend)\n"
            "*Source: [[my-source]] | Added: 2026-03-19*\n\n"
            "First version of evidence.\n"
        )
        block2 = (
            "\n\n### Additional Evidence (extend)\n"
            "*Source: [[my-source]] | Added: 2026-03-19*\n\n"
            "Second version of evidence (rebase duplicate).\n"
        )
        content = f"---\ntype: claim\n---\n\nBody.{block1}{block2}"
        result = dedup_evidence_blocks(content)
        assert result.count("[[my-source]]") == 1
        # First occurrence kept
        assert "First version" in result
        assert "Second version" not in result

    def test_real_pr1751_pattern(self):
        """Reproduce the actual PR #1751 duplicate pattern from space-development."""
        content = (
            "---\ntype: claim\ndomain: space-development\n---\n\n"
            "Claim about SpaceX vertical integration.\n\n"
            "### Additional Evidence (extend)\n"
            "*Source: [[2026-03-00-commercial-stations-haven1-slip-orbital-reef-delays]] | Added: 2026-03-19*\n\n"
            "Orbital Reef multi-party structure experiencing delays.\n\n"
            "### Additional Evidence (extend)\n"
            "*Source: [[2026-03-00-commercial-stations-haven1-slip-orbital-reef-delays]] | Added: 2026-03-19*\n\n"
            "Orbital Reef multi-party structure experiencing delays (duplicate from rebase).\n\n"
            "---\n\n"
            "### Additional Evidence (extend)\n"
            "*Source: [[2026-03-19-space-com-starship-v3-first-static-fire]] | Added: 2026-03-24*\n\n"
            "V3 Starship static fire completed.\n"
        )
        result = dedup_evidence_blocks(content)
        assert result.count("[[2026-03-00-commercial-stations-haven1-slip-orbital-reef-delays]]") == 1
        assert result.count("[[2026-03-19-space-com-starship-v3-first-static-fire]]") == 1
        # The rebase duplicate (second copy) is the one that must disappear.
        assert "(duplicate from rebase)" not in result
        # The unrelated block after the separator keeps its evidence text.
        assert "V3 Starship static fire completed." in result
|
|
|
|
|
|
# ─── Insertion-time dedup tests ──────────────────────────────────────────
|
|
|
|
|
|
class TestInsertionDedup:
    """Test that enrichment insertion skips already-enriched claims."""

    @staticmethod
    def _write_claim_file(content):
        """Write *content* to a new temporary ``.md`` file and return its path.

        The file is created with ``delete=False`` so it outlives the handle;
        the caller is responsible for removing it.
        """
        # Explicit UTF-8: the fixtures contain an em dash, and the default
        # text encoding otherwise depends on the platform locale.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".md", delete=False, encoding="utf-8"
        ) as f:
            f.write(content)
            # Leaving the ``with`` block closes (and thereby flushes) the
            # file before the code under test reopens it by path.
            return f.name

    def test_entity_batch_dedup(self):
        """_apply_claim_enrichment skips if PR already enriched the claim."""
        from lib.entity_batch import _apply_claim_enrichment

        path = self._write_claim_file(
            "---\ntype: claim\n---\n\nClaim body.\n\n"
            "### Auto-enrichment (near-duplicate conversion, similarity=0.90)\n"
            "*Source: PR #100 — \"Existing enrichment\"*\n\n"
            "Already enriched evidence.\n"
        )

        try:
            # NOTE(review): positional arguments are presumably
            # (path, evidence text, PR number, title, similarity) — confirm
            # against lib.entity_batch.
            ok, msg = _apply_claim_enrichment(path, "New evidence", 100, "Duplicate", 0.91)
            assert not ok
            assert "already enriched" in msg

            # Different PR should succeed
            ok2, msg2 = _apply_claim_enrichment(path, "New evidence", 200, "Different PR", 0.88)
            # Surface the returned message if the enrichment is rejected.
            assert ok2, msg2
        finally:
            os.unlink(path)

    def test_entity_batch_first_enrichment_succeeds(self):
        """First enrichment of a claim by a PR should succeed."""
        from lib.entity_batch import _apply_claim_enrichment

        path = self._write_claim_file(
            "---\ntype: claim\n---\n\nClaim body with no enrichments yet.\n"
        )

        try:
            ok, msg = _apply_claim_enrichment(path, "New evidence", 500, "First enrichment", 0.92)
            # Surface the returned message if the enrichment is rejected.
            assert ok, msg
            # Read back with the same explicit encoding used for the fixture.
            with open(path, encoding="utf-8") as rf:
                content = rf.read()
            assert "PR #500" in content
        finally:
            os.unlink(path)
|