#!/bin/bash
# Batch extract sources from inbox/queue/ — v3 with two-gate skip logic
#
# Uses separate extract/ worktree (not main/ — prevents daemon race condition).
# Skip logic uses two checks instead of local marker files (Ganymede v3 review):
#   Gate 1: Is source already in archive/{domain}/?   → already processed, dedup
#   Gate 2: Does extraction branch exist on Forgejo?  → extraction in progress
#   Neither → extract
#
# Architecture: Ganymede (two-gate) + Rhea (separate worktrees)
#
# NOTE: strict mode (set -e) is intentionally NOT enabled. Many git calls below
# are best-effort (e.g. deleting a branch that may not exist); the steps whose
# failure matters are checked explicitly.

REPO=/opt/teleo-eval/workspaces/extract
MAIN_REPO=/opt/teleo-eval/workspaces/main
EXTRACT=/opt/teleo-eval/openrouter-extract-v2.py
CLEANUP=/opt/teleo-eval/post-extract-cleanup.py
LOG=/opt/teleo-eval/logs/batch-extract-50.log
TOKEN=$(cat /opt/teleo-eval/secrets/forgejo-leo-token)
FORGEJO_URL="http://localhost:3000"
MAX=50
COUNT=0
SUCCESS=0
FAILED=0
SKIPPED=0

# Derived endpoints — built from FORGEJO_URL so the host is configured in one place.
REPO_SLUG="teleo/teleo-codex"
PUSH_URL="${FORGEJO_URL%%://*}://leo:${TOKEN}@${FORGEJO_URL#*://}/${REPO_SLUG}.git"
PR_API="${FORGEJO_URL}/api/v1/repos/${REPO_SLUG}/pulls"

# Trailer appended (as its own paragraph) to every commit this script makes.
AGENT_TRAILER="Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>"

# Append a timestamped line to the batch log.
log() {
  echo "[$(date)] $*" >> "$LOG"
}

# Append an un-timestamped per-source result line to the batch log.
log_detail() {
  echo " -> $*" >> "$LOG"
}

# Succeeds if a file with basename $1 exists anywhere under the main worktree's
# inbox/archive. Stops at the first hit instead of scanning the whole tree.
in_archive() {
  find "$MAIN_REPO/inbox/archive" -name "$1" -print -quit 2>/dev/null | grep -q .
}

# Succeeds if branch $1 is present in the cached `git ls-remote --heads` output.
# Exact literal match on the ref column ("<sha>\t<ref>" per line), so characters
# such as '.' in a source filename are not interpreted as regex metacharacters.
remote_has_branch() {
  [[ $'\n'"$REMOTE_BRANCHES"$'\n' == *$'\t'"refs/heads/$1"$'\n'* ]]
}

# Escape backslashes and double quotes so $1 can be embedded in a JSON string.
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  printf '%s' "$s"
}

# Discard local state in the extract worktree and sync it to origin/main.
reset_to_origin_main() {
  git checkout -f main 2>/dev/null
  git fetch origin main 2>/dev/null
  git reset --hard origin/main 2>/dev/null
}

if [ -z "$TOKEN" ]; then
  log "WARN: Forgejo token is empty — branch pushes and PR creation will fail"
fi

# Lockfile to prevent concurrent runs
LOCKFILE="/tmp/batch-extract.lock"
if [ -f "$LOCKFILE" ]; then
  pid=$(cat "$LOCKFILE" 2>/dev/null)
  if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
    log "SKIP: batch extract already running (pid $pid)"
    exit 0
  fi
  # Owner is gone — stale lock, take it over.
  rm -f "$LOCKFILE"
fi
# Create the lock atomically: with noclobber the redirect fails if another run
# created the file between the check above and this write.
if ! (set -o noclobber; echo $$ > "$LOCKFILE") 2>/dev/null; then
  log "SKIP: batch extract already running (lock acquired by another run)"
  exit 0
fi
trap 'rm -f "$LOCKFILE"' EXIT

log "Starting batch extraction of $MAX sources"

cd "$REPO" || exit 1
reset_to_origin_main

# Pre-extraction cleanup: remove queue files that already exist in archive
# This runs on the MAIN worktree (not extract/) so deletions are committed to git.
# Prevents the "queue duplicate reappears after reset --hard" problem.
CLEANED=0
for qfile in "$MAIN_REPO"/inbox/queue/*.md; do
  [ -f "$qfile" ] || continue
  if in_archive "$(basename "$qfile")"; then
    rm -f "$qfile"
    CLEANED=$((CLEANED + 1))
  fi
done

if [ "$CLEANED" -gt 0 ]; then
  log "Cleaned $CLEANED stale queue duplicates"
  # Subshell: the cd into the main worktree cannot leak into the rest of the
  # script, so a failed cd can never cause later git commands to run there.
  (
    cd "$MAIN_REPO" || { log "WARN: cannot cd to $MAIN_REPO — cleanup not committed"; exit 1; }
    git add -A inbox/queue/ 2>/dev/null
    git commit -m "pipeline: clean $CLEANED stale queue duplicates" -m "$AGENT_TRAILER" 2>/dev/null
    # Push with retry
    pushed=0
    for _ in 1 2 3; do
      git pull --rebase origin main 2>/dev/null
      if git push origin main 2>/dev/null; then
        pushed=1
        break
      fi
      sleep 2
    done
    [ "$pushed" -eq 1 ] || log "WARN: could not push queue cleanup to origin/main after 3 attempts"
  )
  # Pick up the cleanup commit in the extract worktree (cwd is still $REPO).
  git fetch origin main 2>/dev/null
  git reset --hard origin/main 2>/dev/null
fi

# Get sources in queue (at most $MAX). Globbing into an array instead of parsing
# `ls` keeps filenames containing whitespace intact.
SOURCES=()
for candidate in inbox/queue/*.md; do
  [ -e "$candidate" ] || continue
  SOURCES+=("$candidate")
  if [ "${#SOURCES[@]}" -ge "$MAX" ]; then
    break
  fi
done

# Batch fetch all remote branches once (Ganymede: 1 call instead of 84)
if ! REMOTE_BRANCHES=$(git ls-remote --heads origin 2>/dev/null); then
  log "ABORT: git ls-remote failed — remote unreachable, skipping cycle"
  exit 0
fi

for SOURCE in "${SOURCES[@]}"; do
  COUNT=$((COUNT + 1))
  BASENAME=$(basename "$SOURCE" .md)
  BRANCH="extract/$BASENAME"

  # Gate 1: Already in archive? Source was already processed — dedup (Ganymede)
  if in_archive "$BASENAME.md"; then
    log "[$COUNT/$MAX] SKIP $BASENAME (already in archive)"
    # Delete the queue duplicate
    rm -f "$MAIN_REPO/inbox/queue/$BASENAME.md" 2>/dev/null
    SKIPPED=$((SKIPPED + 1))
    continue
  fi

  # Gate 2: Branch exists on Forgejo? Extraction already in progress (cached lookup)
  if remote_has_branch "$BRANCH"; then
    log "[$COUNT/$MAX] SKIP $BASENAME (branch exists — in progress)"
    SKIPPED=$((SKIPPED + 1))
    continue
  fi

  log "[$COUNT/$MAX] Processing $BASENAME"

  # Reset to main
  reset_to_origin_main

  # Clean stale remote branch (Leo's catch — prevents checkout conflicts).
  # Best-effort: the cached branch list may be out of date by now.
  git push origin --delete "$BRANCH" 2>/dev/null

  # Create fresh branch
  git branch -D "$BRANCH" 2>/dev/null
  if ! git checkout -b "$BRANCH" 2>/dev/null; then
    log_detail "SKIP (branch creation failed)"
    SKIPPED=$((SKIPPED + 1))
    continue
  fi

  # Run extraction
  python3 "$EXTRACT" "$SOURCE" --no-review >> "$LOG" 2>&1
  EXTRACT_RC=$?
  if [ "$EXTRACT_RC" -ne 0 ]; then
    FAILED=$((FAILED + 1))
    log_detail "FAILED (extract rc=$EXTRACT_RC)"
    continue
  fi

  # Post-extraction cleanup
  python3 "$CLEANUP" "$REPO" >> "$LOG" 2>&1

  # Check if any files were created/modified
  CHANGED=$(git status --porcelain | wc -l | tr -d " ")
  if [ "$CHANGED" -eq 0 ]; then
    log_detail "No changes (enrichment/null-result only)"
    continue
  fi

  # Commit — a failed commit must not be pushed or reported as success.
  git add -A
  if ! git commit -m "extract: $BASENAME" -m "$AGENT_TRAILER" >> "$LOG" 2>&1; then
    FAILED=$((FAILED + 1))
    log_detail "FAILED (commit failed)"
    continue
  fi

  # Push — without the branch on the remote there is nothing to open a PR from.
  if ! git push "$PUSH_URL" "$BRANCH" --force >> "$LOG" 2>&1; then
    FAILED=$((FAILED + 1))
    log_detail "FAILED (push failed)"
    continue
  fi

  # Create PR. A failure here is logged but not fatal: the branch is already
  # on the remote, so the extraction itself has been delivered.
  pr_title=$(json_escape "extract: $BASENAME")
  pr_head=$(json_escape "$BRANCH")
  if ! curl -sf -X POST "$PR_API" \
    -H "Authorization: token $TOKEN" \
    -H "Content-Type: application/json" \
    -d "{\"title\":\"$pr_title\",\"head\":\"$pr_head\",\"base\":\"main\"}" > /dev/null 2>&1; then
    log_detail "WARN (PR creation request failed — branch pushed without PR)"
  fi

  SUCCESS=$((SUCCESS + 1))
  log_detail "SUCCESS ($CHANGED files)"

  # Back to main
  git checkout -f main 2>/dev/null

  # Rate limit
  sleep 2
done

log "Batch complete: $SUCCESS success, $FAILED failed, $SKIPPED skipped (already attempted)"

# Leave the extract worktree clean on main for the next run.
git checkout -f main 2>/dev/null
git reset --hard origin/main 2>/dev/null