Layer 1: Insertion-time dedup in openrouter-extract-v2.py — skip if source_slug already appears in claim content. Layer 2: Insertion-time dedup in entity_batch.py — skip if PR number already enriched this claim. Layer 3: Post-rebase dedup in merge.py — scan rebased files for duplicate evidence blocks (same source reference) and remove them before force-push. Root cause: multiple enrichment branches modify the same claim at the same insertion point. When rebased sequentially, evidence blocks are duplicated. (Leo: PRs #1751, #1752) lib/dedup.py: standalone module — parses evidence headers, deduplicates by source key, preserves trailing content (Relevant Notes, Topics sections). 9 tests covering all patterns including the real PR #1751 duplication case. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
113 lines
4.1 KiB
Python
"""Evidence block deduplication for enrichment idempotency.
|
|
|
|
Removes duplicate '### Additional Evidence' and '### Auto-enrichment' blocks
|
|
that arise from rebase of enrichment branches. (Leo: PRs #1751, #1752)
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
|
|
logger = logging.getLogger("pipeline.dedup")

# Matches start of an evidence block header
_EVIDENCE_HEADER = re.compile(
    r'^### (?:Additional Evidence|Auto-enrichment) \(',
    re.MULTILINE,
)

# Extracts source key from the *Source: ...* line
_SOURCE_LINE = re.compile(r'^\*Source: (.+)\*', re.MULTILINE)

# Section markers that terminate the final evidence block.  The \b stops
# false positives on lines that merely *begin* with the marker text
# (e.g. "Topicsology ..." must not be treated as a "Topics" section).
_SECTION_MARKER = re.compile(r'^(?:Relevant Notes|Topics)\b')


def _last_evidence_block_end(rest: str) -> int:
    """Return the offset within *rest* at which the final evidence block ends.

    *rest* starts at the block's '### ...' header line.  The block runs
    until the first section boundary ('---', a '## '/'### ' heading, or a
    'Relevant Notes' / 'Topics' line) that appears after the block's body
    content, or to the end of *rest* when no boundary follows.
    """
    lines = rest.split("\n")
    end_offset = len(rest)
    past_source = False  # saw the '*Source: ...*' line
    past_body = False    # saw non-blank content after the source line
    line_pos = 0         # offset of the current line within *rest*
    for j, line in enumerate(lines):
        if j == 0:
            # The '### ...' header itself is never a boundary.
            line_pos += len(line) + 1
            continue
        if line.startswith("*Source:"):
            past_source = True
            line_pos += len(line) + 1
            continue
        if past_source and line.strip() == "":
            # Blank line after the source line — body may still follow.
            line_pos += len(line) + 1
            continue
        if past_source and line.strip():
            past_body = True
        # Once body content has been seen, a section marker ends the
        # block just before the marker line (the marker itself is
        # trailing content, not part of the evidence block).
        if past_body and (
            line.startswith("---")
            or line.startswith("## ")
            or line.startswith("### ")  # next evidence or other heading
            or _SECTION_MARKER.match(line)
        ):
            end_offset = line_pos
            break
        line_pos += len(line) + 1
    return end_offset


def dedup_evidence_blocks(content: str) -> str:
    """Remove duplicate evidence blocks from a claim file.

    After rebase, two enrichment branches can produce duplicate evidence
    blocks with the same source reference.  Keeps the first occurrence of
    each source, removes subsequent duplicates, and preserves trailing
    non-evidence content (Relevant Notes, Topics, '---' separators).

    Args:
        content: Full text of the claim file.

    Returns:
        The content with duplicate evidence blocks removed; returned
        unchanged when fewer than two evidence headers are present.
    """
    headers = list(_EVIDENCE_HEADER.finditer(content))
    if len(headers) < 2:
        # At most one block — nothing can be a duplicate.
        return content

    # Parse each block's extent and source key: (start, end, source_key).
    blocks: list[tuple[int, int, str]] = []
    for i, hdr in enumerate(headers):
        block_start = hdr.start()
        if i + 1 < len(headers):
            # Non-final blocks extend to the next evidence header.
            block_end = headers[i + 1].start()
        else:
            # The final block must NOT swallow trailing non-evidence
            # content (Relevant Notes, ---, etc.) after its body.
            block_end = block_start + _last_evidence_block_end(
                content[block_start:]
            )

        block_text = content[block_start:block_end]
        src_match = _SOURCE_LINE.search(block_text)
        # A block with no source line gets a unique key so it is kept.
        source_key = src_match.group(1).strip() if src_match else f"_unknown_{i}"
        blocks.append((block_start, block_end, source_key))

    # Rebuild content, keeping only the first block for each source.
    seen: set[str] = set()
    result_parts = [content[:blocks[0][0]]]
    removed = 0

    for start, end, source_key in blocks:
        if source_key in seen:
            removed += 1
            continue
        seen.add(source_key)
        result_parts.append(content[start:end])

    # Preserve any content after the last block, even if that block was
    # itself removed as a duplicate.
    last_end = blocks[-1][1]
    if last_end < len(content):
        result_parts.append(content[last_end:])

    if removed > 0:
        logger.info("Deduped %d duplicate evidence block(s)", removed)

    return "".join(result_parts)
|