teleo-codex/ops/pipeline-v2/lib/dedup.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then run deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00

113 lines
4.1 KiB
Python

"""Evidence block deduplication for enrichment idempotency.

Strips duplicate '### Additional Evidence' and '### Auto-enrichment'
blocks left behind when enrichment branches are rebased.
(Leo: PRs #1751, #1752)
"""
import logging
import re

logger = logging.getLogger("pipeline.dedup")

# Start-of-line header that opens an evidence block.
_EVIDENCE_HEADER = re.compile(
    r'^### (?:Additional Evidence|Auto-enrichment) \(',
    re.MULTILINE,
)

# Pulls the source key out of a '*Source: ...*' line.
_SOURCE_LINE = re.compile(r'^\*Source: (.+)\*', re.MULTILINE)


def _last_block_extent(rest: str) -> int:
    """Return the length of the final evidence block within *rest*.

    *rest* begins at the block's '### ...' header. The block ends just
    before the first section boundary ('---', a markdown heading, or a
    'Relevant Notes'/'Topics' line) that follows the block's body text;
    when no boundary is found the block runs to the end of *rest*.
    """
    offset = 0
    seen_source = False
    seen_body = False
    for j, line in enumerate(rest.split("\n")):
        step = len(line) + 1  # account for the '\n' removed by split()
        if j == 0:
            # Skip the '### ...' header line itself.
            offset += step
            continue
        if line.startswith("*Source:"):
            seen_source = True
            offset += step
            continue
        stripped = line.strip()
        if seen_source and not stripped:
            # Blank lines after the source line stay inside the block.
            offset += step
            continue
        if seen_source and stripped:
            seen_body = True
        if seen_body and (
            line.startswith(("---", "## ", "### "))
            or re.match(r'^(?:Relevant Notes|Topics)\s*:?', line)
        ):
            # Boundary reached: the block ends right before this line.
            return offset
        offset += step
    return len(rest)


def dedup_evidence_blocks(content: str) -> str:
    """Remove duplicate evidence blocks from a claim file.

    After a rebase, two enrichment branches can leave behind evidence
    blocks citing the same source. The first block for each source key
    is kept; every later block repeating that key is dropped.
    """
    matches = list(_EVIDENCE_HEADER.finditer(content))
    if len(matches) < 2:
        # Zero or one evidence block: nothing can be duplicated.
        return content

    # One (start, end, source_key) triple per evidence block found.
    spans = []
    for idx, match in enumerate(matches):
        start = match.start()
        if idx + 1 < len(matches):
            # An interior block runs right up to the next header.
            end = matches[idx + 1].start()
        else:
            # The last block needs a scan for where evidence content
            # actually ends; trailing non-evidence sections (Relevant
            # Notes, '---', ...) must not be swallowed into the block.
            end = start + _last_block_extent(content[start:])
        found = _SOURCE_LINE.search(content[start:end])
        # Blocks without a parseable source get a per-index key so they
        # are never treated as duplicates of each other.
        key = found.group(1).strip() if found else f"_unknown_{idx}"
        spans.append((start, end, key))

    # Rebuild the file, keeping only the first block per source key.
    kept: set[str] = set()
    pieces = [content[:spans[0][0]]]  # preamble before the first block
    dropped = 0
    for start, end, key in spans:
        if key in kept:
            dropped += 1
            continue
        kept.add(key)
        pieces.append(content[start:end])
    # Preserve whatever follows the final block (notes, separators).
    tail_at = spans[-1][1]
    if tail_at < len(content):
        pieces.append(content[tail_at:])
    if dropped:
        logger.info("Deduped %d duplicate evidence block(s)", dropped)
    return "".join(pieces)