Layer 1: Insertion-time dedup in openrouter-extract-v2.py — skip if source_slug already appears in claim content. Layer 2: Insertion-time dedup in entity_batch.py — skip if PR number already enriched this claim. Layer 3: Post-rebase dedup in merge.py — scan rebased files for duplicate evidence blocks (same source reference) and remove them before force-push. Root cause: multiple enrichment branches modify the same claim at the same insertion point. When rebased sequentially, evidence blocks are duplicated. (Leo: PRs #1751, #1752) lib/dedup.py: standalone module — parses evidence headers, deduplicates by source key, preserves trailing content (Relevant Notes, Topics sections). 9 tests covering all patterns including the real PR #1751 duplication case. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
113 lines
4.1 KiB
Python
"""Evidence block deduplication for enrichment idempotency.
|
|
|
|
Removes duplicate '### Additional Evidence' and '### Auto-enrichment' blocks
|
|
that arise from rebase of enrichment branches. (Leo: PRs #1751, #1752)
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
|
|
logger = logging.getLogger("pipeline.dedup")

# Matches start of an evidence block header
_EVIDENCE_HEADER = re.compile(
    r'^### (?:Additional Evidence|Auto-enrichment) \(',
    re.MULTILINE,
)

# Extracts source key from the *Source: ...* line
_SOURCE_LINE = re.compile(r'^\*Source: (.+)\*', re.MULTILINE)

# Section markers that terminate the final evidence block.  The \b stops
# false positives on lines that merely *begin* with the marker text
# (e.g. "Topicsology ..." must not be treated as a "Topics" section).
_SECTION_MARKER = re.compile(r'^(?:Relevant Notes|Topics)\b')


def _last_evidence_block_end(rest: str) -> int:
    """Return the offset within *rest* at which the final evidence block ends.

    *rest* starts at the block's '### ...' header line.  The block runs
    until the first section boundary ('---', a '## '/'### ' heading, or a
    'Relevant Notes' / 'Topics' line) that appears after the block's body
    content, or to the end of *rest* when no boundary follows.
    """
    lines = rest.split("\n")
    end_offset = len(rest)
    past_source = False  # saw the '*Source: ...*' line
    past_body = False    # saw non-blank content after the source line
    line_pos = 0         # offset of the current line within *rest*
    for j, line in enumerate(lines):
        if j == 0:
            # The '### ...' header itself is never a boundary.
            line_pos += len(line) + 1
            continue
        if line.startswith("*Source:"):
            past_source = True
            line_pos += len(line) + 1
            continue
        if past_source and line.strip() == "":
            # Blank line after the source line — body may still follow.
            line_pos += len(line) + 1
            continue
        if past_source and line.strip():
            past_body = True
        # Once body content has been seen, a section marker ends the
        # block just before the marker line (the marker itself is
        # trailing content, not part of the evidence block).
        if past_body and (
            line.startswith("---")
            or line.startswith("## ")
            or line.startswith("### ")  # next evidence or other heading
            or _SECTION_MARKER.match(line)
        ):
            end_offset = line_pos
            break
        line_pos += len(line) + 1
    return end_offset


def dedup_evidence_blocks(content: str) -> str:
    """Remove duplicate evidence blocks from a claim file.

    After rebase, two enrichment branches can produce duplicate evidence
    blocks with the same source reference.  Keeps the first occurrence of
    each source, removes subsequent duplicates, and preserves trailing
    non-evidence content (Relevant Notes, Topics, '---' separators).

    Args:
        content: Full text of the claim file.

    Returns:
        The content with duplicate evidence blocks removed; returned
        unchanged when fewer than two evidence headers are present.
    """
    headers = list(_EVIDENCE_HEADER.finditer(content))
    if len(headers) < 2:
        # At most one block — nothing can be a duplicate.
        return content

    # Parse each block's extent and source key: (start, end, source_key).
    blocks: list[tuple[int, int, str]] = []
    for i, hdr in enumerate(headers):
        block_start = hdr.start()
        if i + 1 < len(headers):
            # Non-final blocks extend to the next evidence header.
            block_end = headers[i + 1].start()
        else:
            # The final block must NOT swallow trailing non-evidence
            # content (Relevant Notes, ---, etc.) after its body.
            block_end = block_start + _last_evidence_block_end(
                content[block_start:]
            )

        block_text = content[block_start:block_end]
        src_match = _SOURCE_LINE.search(block_text)
        # A block with no source line gets a unique key so it is kept.
        source_key = src_match.group(1).strip() if src_match else f"_unknown_{i}"
        blocks.append((block_start, block_end, source_key))

    # Rebuild content, keeping only the first block for each source.
    seen: set[str] = set()
    result_parts = [content[:blocks[0][0]]]
    removed = 0

    for start, end, source_key in blocks:
        if source_key in seen:
            removed += 1
            continue
        seen.add(source_key)
        result_parts.append(content[start:end])

    # Preserve any content after the last block, even if that block was
    # itself removed as a duplicate.
    last_end = blocks[-1][1]
    if last_end < len(content):
        result_parts.append(content[last_end:])

    if removed > 0:
        logger.info("Deduped %d duplicate evidence block(s)", removed)

    return "".join(result_parts)
|