From 10b4e27c28e2c6856575d43095edc40f2faa7f11 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sat, 4 Apr 2026 16:25:48 +0100 Subject: [PATCH] fix: tighten output gate patterns to eliminate false positives on public content MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 4 patterns were too broad — matched common English words: - "extraction" (concept) matched pipeline extraction pattern - "class X" (English) matched Python class definition pattern - ".md " (product name) matched file extension pattern - "threshold" (concept) matched internal metrics pattern Fixes: - extraction: require pipeline context words (queue/PR/branch/cron) - class/def/import: require line-start (actual code, not prose) - .py/.yaml/.json: require path-like prefix (not bare .md) - threshold: require pipeline context (cosine/vector/Qdrant) All 3 Hermes dry-run drafts now pass. 18/18 tests pass. 11/11 system content regression tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) --- telegram/output_gate.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/telegram/output_gate.py b/telegram/output_gate.py index 169b9e1..7fed326 100644 --- a/telegram/output_gate.py +++ b/telegram/output_gate.py @@ -16,7 +16,8 @@ import re _SYSTEM_PATTERNS = [ # Pipeline operations re.compile(r"\b(PR\s*#\d+|pull request|merge|rebase|cherry.?pick)\b", re.IGNORECASE), - re.compile(r"\b(extraction|extracted|extractor|extract/)\b", re.IGNORECASE), + re.compile(r"\b(batch.?extract|extract/|extractor)\b", re.IGNORECASE), + re.compile(r"\bextract(?:ed|ion)\b.*\b(pipeline|queue|PR|branch|source|cron)\b", re.IGNORECASE), re.compile(r"\b(pipeline|cron|batch.?extract|systemd|teleo-pipeline)\b", re.IGNORECASE), re.compile(r"\b(conflict.?permanent|conflict.?closed|merge.?conflict)\b", re.IGNORECASE), @@ -53,13 +54,14 @@ _SYSTEM_PATTERNS = [ # UUIDs (conversation IDs, agent IDs)
re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.IGNORECASE), - # Code / technical - re.compile(r"\b(def\s+\w+|import\s+\w+|class\s+\w+)\b"), - re.compile(r"\b(\.py|\.yaml|\.json|\.md)\s", re.IGNORECASE), + # Code / technical — require line-start or code context to avoid matching English "class" + re.compile(r"^\s*(def|import|class)\s+\w+", re.MULTILINE), + re.compile(r"[\w/]+\.(py|yaml|json)\b", re.IGNORECASE), re.compile(r"\b(sqlite3?|pipeline\.db|response_audit)\b", re.IGNORECASE), - # Internal metrics / debugging - re.compile(r"\b(cosine.?sim|threshold|PRIOR_ART_THRESHOLD)\b", re.IGNORECASE), + # Internal metrics / debugging — require pipeline context, not bare English words + re.compile(r"\b(cosine.?sim|PRIOR_ART_THRESHOLD|SCHEMA_VERSION)\b", re.IGNORECASE), + re.compile(r"\bthreshold\b.*\b(cosine|vector|Qdrant|embedding|pre.?screen)\b", re.IGNORECASE), re.compile(r"\b(pre.?screen|Layer\s*[01234]|RRF|entity.?boost)\b", re.IGNORECASE), # Paths