Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source of truth. Previously only 8 of 67 files existed in repo — the rest were deployed directly to VPS via SCP, causing massive drift. Includes: - pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.) - pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh - diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics) - agent-state/: bootstrap, lib-state, cascade inbox processor, schema - systemd/: service unit files for reference - deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate - research-session.sh: updated with Step 8.5 digest + cascade inbox processing No new code written — all files are exact copies from VPS as of 2026-04-06. From this point forward: edit in repo, commit, then deploy.sh. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
113 lines
4.1 KiB
Python
113 lines
4.1 KiB
Python
"""Evidence block deduplication for enrichment idempotency.
|
|
|
|
Removes duplicate '### Additional Evidence' and '### Auto-enrichment' blocks
|
|
that arise from rebase of enrichment branches. (Leo: PRs #1751, #1752)
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
|
|
logger = logging.getLogger("pipeline.dedup")

# Matches the opening line of an evidence block, e.g.
# "### Additional Evidence (2026-01-15)" or "### Auto-enrichment (batch-3)".
_EVIDENCE_HEADER = re.compile(
    r'^### (?:Additional Evidence|Auto-enrichment) \(',
    re.MULTILINE,
)

# Captures the source identifier from a "*Source: ...*" attribution line.
_SOURCE_LINE = re.compile(r'^\*Source: (.+)\*', re.MULTILINE)

# Start of a non-evidence section ("Relevant Notes", "Topics", optionally
# followed by a colon). Hoisted to module level so the per-line scan in
# _last_block_end does not re-run re.match pattern lookup on every line.
_SECTION_BOUNDARY = re.compile(r'^(?:Relevant Notes|Topics)\s*:?')


def _last_block_end(rest: str) -> int:
    """Return the offset within *rest* at which the final evidence block ends.

    *rest* starts at the last "### ..." evidence header. The scan walks
    line by line: it skips the header line, records when the
    "*Source: ...*" attribution has been seen, then records when
    non-blank body text has been seen. Once body text exists, the first
    section boundary (a "---" rule, a "## "/"### " heading, or a
    "Relevant Notes"/"Topics" line) marks the end of the block, so
    trailing non-evidence content is NOT swallowed into the block.
    Falls back to len(rest) when no boundary is found.
    """
    lines = rest.split("\n")
    end_offset = len(rest)
    past_source = False
    past_body = False
    line_pos = 0  # byte offset of the current line within rest
    for idx, line in enumerate(lines):
        if idx == 0:
            # The "### ..." header line itself.
            line_pos += len(line) + 1
            continue
        if line.startswith("*Source:"):
            past_source = True
            line_pos += len(line) + 1
            continue
        if past_source and line.strip() == "":
            # Blank line after the source line — start of the body.
            line_pos += len(line) + 1
            continue
        if past_source and line.strip():
            past_body = True
        # After body content has been seen, a section marker ends the block
        # at the start of the marker line.
        if past_body and (
            line.startswith("---")
            or line.startswith("## ")
            or line.startswith("### ")  # next evidence or other heading
            or _SECTION_BOUNDARY.match(line)
        ):
            end_offset = line_pos
            break
        line_pos += len(line) + 1
    return end_offset


def dedup_evidence_blocks(content: str) -> str:
    """Remove duplicate evidence blocks from a claim file.

    After a rebase, two enrichment branches can produce duplicate
    "### Additional Evidence" / "### Auto-enrichment" blocks citing the
    same source. Keeps the first occurrence of each source key and drops
    later blocks with the same key. Blocks without a recognizable
    "*Source: ...*" line get a unique placeholder key and are never
    treated as duplicates.

    Args:
        content: Full text of the claim file.

    Returns:
        The content with duplicate evidence blocks removed; returned
        unchanged when fewer than two evidence blocks are present.
    """
    headers = list(_EVIDENCE_HEADER.finditer(content))
    if len(headers) < 2:
        # Zero or one block: nothing can be duplicated.
        return content

    # Determine each block's (start, end, source_key). A block runs up to
    # the next evidence header; the last block ends at the first
    # non-evidence section boundary so trailing notes are preserved.
    blocks: list[tuple[int, int, str]] = []
    for i, hdr in enumerate(headers):
        start = hdr.start()
        if i + 1 < len(headers):
            end = headers[i + 1].start()
        else:
            end = start + _last_block_end(content[start:])
        src = _SOURCE_LINE.search(content[start:end])
        # No source line: unique key so the block is always kept.
        key = src.group(1).strip() if src else f"_unknown_{i}"
        blocks.append((start, end, key))

    # Rebuild, keeping only the first block seen for each source key.
    seen: set[str] = set()
    parts = [content[:blocks[0][0]]]
    removed = 0
    for start, end, key in blocks:
        if key in seen:
            removed += 1
            continue
        seen.add(key)
        parts.append(content[start:end])

    # Preserve any content after the last evidence block (---, notes, etc.).
    last_end = blocks[-1][1]
    if last_end < len(content):
        parts.append(content[last_end:])

    if removed:
        logger.info("Deduped %d duplicate evidence block(s)", removed)

    return "".join(parts)
|