"""Evidence block deduplication for enrichment idempotency.

Removes duplicate '### Additional Evidence' and '### Auto-enrichment' blocks
that arise from rebase of enrichment branches. (Leo: PRs #1751, #1752)
"""

import logging
import re

logger = logging.getLogger("pipeline.dedup")

# Matches start of an evidence block header
_EVIDENCE_HEADER = re.compile(
    r'^### (?:Additional Evidence|Auto-enrichment) \(',
    re.MULTILINE,
)

# Extracts source key from the *Source: ...* line
_SOURCE_LINE = re.compile(r'^\*Source: (.+)\*', re.MULTILINE)

# Line prefixes that mark the start of non-evidence content: a horizontal
# rule, a level-2 heading, or a level-3 heading (next evidence or other).
_BOUNDARY_PREFIXES = ("---", "## ", "### ")

# Footer labels that also terminate an evidence block.
_FOOTER_LABEL = re.compile(r'^(?:Relevant Notes|Topics)\s*:?')


def _find_block_end(content: str, start: int, limit: int) -> int:
    """Return the absolute offset at which the evidence block at *start* ends.

    Only ``content[start:limit]`` is scanned. The block ends at the first
    non-blank line that follows the ``*Source:`` line and is a section
    boundary (``---``, ``## ``, ``### ``, ``Relevant Notes``, ``Topics``).
    If no such line exists -- including when the block has no ``*Source:``
    line at all -- the block runs to *limit*.

    Args:
        content: Full text of the claim file.
        start: Offset of the block's ``### ...`` header line.
        limit: Upper bound for the block (start of the next evidence header,
            or ``len(content)`` for the last block).

    Returns:
        Offset in *content* one past the last character of the block.
    """
    offset = 0
    seen_source = False
    for index, line in enumerate(content[start:limit].split("\n")):
        if index == 0:
            # The header itself starts with "### "; never treat it as a
            # boundary.
            pass
        elif line.startswith("*Source:"):
            seen_source = True
        elif seen_source and line.strip():
            # Boundaries are only honoured once the source line has been
            # seen, so a malformed block is never truncated early.
            if line.startswith(_BOUNDARY_PREFIXES) or _FOOTER_LABEL.match(line):
                return start + offset
        # +1 accounts for the "\n" consumed by split().
        offset += len(line) + 1
    return limit


def dedup_evidence_blocks(content: str) -> str:
    """Remove duplicate evidence blocks from a claim file.

    After rebase, two enrichment branches can produce duplicate evidence
    blocks with the same source reference. Keeps the first occurrence of
    each source, removes subsequent duplicates.

    Every block is bounded both by the next evidence header and by the first
    section boundary after its body, so non-evidence content that sits
    between two evidence blocks (``---``, ``Relevant Notes:``, other
    headings) is always preserved, even when the evidence block preceding it
    is dropped as a duplicate.

    Args:
        content: Full text of the claim file.

    Returns:
        The text with duplicate evidence blocks removed. The input is
        returned unchanged when it has fewer than two evidence headers.
    """
    headers = list(_EVIDENCE_HEADER.finditer(content))
    if len(headers) < 2:
        return content

    starts = [hdr.start() for hdr in headers]
    # A block can never extend past the next evidence header (or EOF).
    limits = starts[1:] + [len(content)]

    seen: set[str] = set()
    parts: list[str] = []
    cursor = 0  # end of the most recently processed block
    removed = 0
    for i, (start, limit) in enumerate(zip(starts, limits)):
        end = _find_block_end(content, start, limit)

        # Text before the first block, or between two blocks, is not
        # evidence and is copied through untouched.
        parts.append(content[cursor:start])
        cursor = end

        src_match = _SOURCE_LINE.search(content[start:end])
        # Blocks without a parsable source get a unique key so they are
        # never considered duplicates of each other.
        source_key = src_match.group(1).strip() if src_match else f"_unknown_{i}"

        if source_key in seen:
            removed += 1
            continue
        seen.add(source_key)
        parts.append(content[start:end])

    # Anything after the last block (footer sections, trailing text).
    parts.append(content[cursor:])

    if removed > 0:
        logger.info("Deduped %d duplicate evidence block(s)", removed)

    return "".join(parts)