From 0854375fd0f7f39c6ee56916cc78e8c20bf1a927 Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Thu, 26 Mar 2026 12:02:57 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20skip=20format:=20conversation=20in=20ext?=
 =?UTF-8?q?raction=20=E2=80=94=20archive=20directly=20instead?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Conversation archives produce low-quality claims (26x schema failures,
22x near-duplicates in 24h). Valuable content from conversations now
enters through three other paths:
1. Standalone sources (URLs shared → x-article/x-tweet files)
2. Inline tags (SOURCE:/CLAIM: → curated source files)
3. Transcript review (1-hour JSONL dumps → periodic safety net)

Conversations moved to inbox/archive/telegram/ for provenance without
burning extraction cycles. If the move fails, the source is still
skipped for this run and the failure is logged as ARCHIVE-FAILED
instead of being reported as archived.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
---
Review note: the hunk below was revised after the original submission
(the destination directory is created and the mv result is checked), so
the post-image blob id on the "index" line no longer matches the
resulting file.

 batch-extract-50.sh | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/batch-extract-50.sh b/batch-extract-50.sh
index 88ce9b8..58347ba 100755
--- a/batch-extract-50.sh
+++ b/batch-extract-50.sh
@@ -97,6 +97,23 @@ for SOURCE in $SOURCES; do
     BASENAME=$(basename "$SOURCE" .md)
     BRANCH="extract/$BASENAME"

+    # Skip conversation archives — valuable content enters through standalone sources,
+    # inline tags (SOURCE:/CLAIM:), and transcript review. Raw conversations produce
+    # low-quality claims with schema failures. (Epimetheus session 4)
+    if grep -q "^format: conversation" "$SOURCE" 2>/dev/null; then
+        # Move to archive instead of leaving in queue (prevents re-processing).
+        # Create the destination first and check the move: a swallowed failure
+        # would leave the file queued while the log claimed it was archived.
+        CONV_ARCHIVE_DIR="$MAIN_REPO/inbox/archive/telegram"
+        if mkdir -p "$CONV_ARCHIVE_DIR" 2>/dev/null && mv -- "$SOURCE" "$CONV_ARCHIVE_DIR/" 2>/dev/null; then
+            echo "[$(date)] [$COUNT/$MAX] ARCHIVE $BASENAME (conversation — skipped extraction)" >> "$LOG"
+        else
+            echo "[$(date)] [$COUNT/$MAX] ARCHIVE-FAILED $BASENAME (conversation — could not move to $CONV_ARCHIVE_DIR; still queued)" >> "$LOG"
+        fi
+        SKIPPED=$((SKIPPED + 1))
+        continue
+    fi
+
     # Gate 1: Already in archive? Source was already processed — dedup (Ganymede)
     if find "$MAIN_REPO/inbox/archive" -name "$BASENAME.md" 2>/dev/null | grep -q .; then
         echo "[$(date)] [$COUNT/$MAX] SKIP $BASENAME (already in archive)" >> $LOG