diff --git a/batch-extract-50.sh b/batch-extract-50.sh
index 88ce9b8..58347ba 100755
--- a/batch-extract-50.sh
+++ b/batch-extract-50.sh
@@ -97,6 +97,24 @@ for SOURCE in $SOURCES; do
     BASENAME=$(basename "$SOURCE" .md)
     BRANCH="extract/$BASENAME"
 
+    # Skip conversation archives — valuable content enters through standalone sources,
+    # inline tags (SOURCE:/CLAIM:), and transcript review. Raw conversations produce
+    # low-quality claims with schema failures. (Epimetheus session 4)
+    if grep -q "^format: conversation" "$SOURCE" 2>/dev/null; then
+        # Move to archive instead of leaving in queue (prevents re-processing).
+        # Create the target directory first and check the move: if it fails the
+        # file is still queued, so log that instead of claiming it was archived.
+        CONVERSATION_ARCHIVE="$MAIN_REPO/inbox/archive/telegram"
+        if mkdir -p "$CONVERSATION_ARCHIVE" 2>/dev/null \
+            && mv "$SOURCE" "$CONVERSATION_ARCHIVE/" 2>/dev/null; then
+            echo "[$(date)] [$COUNT/$MAX] ARCHIVE $BASENAME (conversation — skipped extraction)" >> "$LOG"
+        else
+            echo "[$(date)] [$COUNT/$MAX] WARN $BASENAME (conversation — archive move failed, left in queue)" >> "$LOG"
+        fi
+        SKIPPED=$((SKIPPED + 1))
+        continue
+    fi
+
     # Gate 1: Already in archive? Source was already processed — dedup (Ganymede)
     if find "$MAIN_REPO/inbox/archive" -name "$BASENAME.md" 2>/dev/null | grep -q .; then
         echo "[$(date)] [$COUNT/$MAX] SKIP $BASENAME (already in archive)" >> $LOG