teleo-infrastructure/lib/dedup.py
m3taversal f43f8f923f fix: enrichment idempotency — three-layer dedup prevents duplicate evidence blocks
Layer 1: Insertion-time dedup in openrouter-extract-v2.py — skip if source_slug
already appears in claim content.
Layer 2: Insertion-time dedup in entity_batch.py — skip if PR number already
enriched this claim.
Layer 3: Post-rebase dedup in merge.py — scan rebased files for duplicate
evidence blocks (same source reference) and remove them before force-push.

Root cause: multiple enrichment branches modify the same claim at the same
insertion point. When rebased sequentially, evidence blocks are duplicated.
(Leo: PRs #1751, #1752)

lib/dedup.py: standalone module — parses evidence headers, deduplicates by
source key, preserves trailing content (Relevant Notes, Topics sections).
9 tests covering all patterns including the real PR #1751 duplication case.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 13:18:23 +01:00

113 lines
4.1 KiB
Python

"""Evidence block deduplication for enrichment idempotency.
Removes duplicate '### Additional Evidence' and '### Auto-enrichment' blocks
that arise from rebase of enrichment branches. (Leo: PRs #1751, #1752)
"""
import logging
import re

logger = logging.getLogger("pipeline.dedup")

# Matches the start of an evidence block header, e.g.
# "### Additional Evidence (2026-01-01)" or "### Auto-enrichment (...)".
_EVIDENCE_HEADER = re.compile(
    r'^### (?:Additional Evidence|Auto-enrichment) \(',
    re.MULTILINE,
)
# Extracts the source key from the *Source: ...* line
_SOURCE_LINE = re.compile(r'^\*Source: (.+)\*', re.MULTILINE)
# Named sections that terminate an evidence block's body (checked in
# addition to the literal "---", "## " and "### " prefixes below).
# Hoisted to module level so it is compiled once, not per scanned line.
_SECTION_MARKER = re.compile(r'^(?:Relevant Notes|Topics)\s*:?')


def _evidence_extent(chunk: str) -> int:
    """Return the length of the evidence block at the start of *chunk*.

    *chunk* must begin at a "### ..." evidence header. The block runs
    through its "*Source: ...*" line and body text, and ends just before
    the first section boundary ("---", "## ", "### ", "Relevant Notes",
    "Topics") that appears *after* body content has been seen. Returns
    ``len(chunk)`` when no such boundary exists.
    """
    lines = chunk.split("\n")
    past_source = False  # seen the "*Source: ...*" line
    past_body = False    # seen non-blank body content after the source line
    pos = 0              # character offset of the current line's start
    for idx, line in enumerate(lines):
        if idx == 0:
            # Skip the "### ..." header line itself.
            pos += len(line) + 1
            continue
        if line.startswith("*Source:"):
            past_source = True
            pos += len(line) + 1
            continue
        if past_source and not line.strip():
            # Blank lines within/after the body stay inside the block.
            pos += len(line) + 1
            continue
        if past_source and line.strip():
            past_body = True
        # After body content, a section marker means the block is done.
        if past_body and (
            line.startswith("---")
            or line.startswith("## ")
            or line.startswith("### ")
            or _SECTION_MARKER.match(line)
        ):
            return pos  # block ends just before this marker line
        pos += len(line) + 1
    return len(chunk)


def dedup_evidence_blocks(content: str) -> str:
    """Remove duplicate evidence blocks from a claim file.

    After rebase, two enrichment branches can produce duplicate evidence
    blocks with the same source reference. Keeps the first occurrence of
    each source and removes subsequent duplicates. Non-evidence content
    trailing a block (e.g. a "## Relevant Notes" section) is preserved
    even when the block itself is removed as a duplicate.

    Args:
        content: Full text of the claim file.

    Returns:
        The content with duplicate evidence blocks removed; the input
        unchanged when fewer than two evidence headers are present.
    """
    headers = list(_EVIDENCE_HEADER.finditer(content))
    if len(headers) < 2:
        return content

    # Parse each block into (start, end, source_key). A block never
    # extends past the next evidence header, and within that bound it
    # ends at the first non-evidence section boundary — so trailing
    # sections are not swallowed into (and deleted with) a duplicate.
    blocks: list[tuple[int, int, str]] = []
    for i, hdr in enumerate(headers):
        start = hdr.start()
        hard_end = headers[i + 1].start() if i + 1 < len(headers) else len(content)
        end = start + _evidence_extent(content[start:hard_end])
        src = _SOURCE_LINE.search(content[start:end])
        # Blocks without a source line get a unique key so they are
        # never treated as duplicates of each other.
        key = src.group(1).strip() if src else f"_unknown_{i}"
        blocks.append((start, end, key))

    # Rebuild content, dropping blocks whose source was already seen.
    seen: set[str] = set()
    parts = [content[:blocks[0][0]]]
    removed = 0
    for i, (start, end, key) in enumerate(blocks):
        if key in seen:
            removed += 1
        else:
            seen.add(key)
            parts.append(content[start:end])
        # Always keep any non-evidence text between this block's end and
        # the next header (or end of file), even for removed duplicates.
        gap_end = blocks[i + 1][0] if i + 1 < len(blocks) else len(content)
        parts.append(content[end:gap_end])

    if removed:
        logger.info("Deduped %d duplicate evidence block(s)", removed)
    return "".join(parts)