teleo-infrastructure/batch-extract-50.sh
m3taversal e233dbbcee epimetheus: auto-clean stale queue duplicates at start of each extract cycle
Pre-extraction step: find queue files already in archive, delete them,
commit + push. Runs on main worktree before extraction starts on extract
worktree. Prevents "queue duplicate reappears after reset --hard" problem
that produced 16 stale entries overnight.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
2026-03-21 14:20:29 +00:00

175 lines
5.6 KiB
Bash
Executable file

#!/bin/bash
# Batch extract sources from inbox/queue/ — v3 with two-gate skip logic
#
# Uses separate extract/ worktree (not main/ — prevents daemon race condition).
# Skip logic uses two checks instead of local marker files (Ganymede v3 review):
# Gate 1: Is source already in archive/{domain}/? → already processed, dedup
# Gate 2: Does extraction branch exist on Forgejo? → extraction in progress
# Neither → extract
#
# Architecture: Ganymede (two-gate) + Rhea (separate worktrees)
REPO=/opt/teleo-eval/workspaces/extract
MAIN_REPO=/opt/teleo-eval/workspaces/main
EXTRACT=/opt/teleo-eval/openrouter-extract-v2.py
CLEANUP=/opt/teleo-eval/post-extract-cleanup.py
LOG=/opt/teleo-eval/logs/batch-extract-50.log
TOKEN=$(cat /opt/teleo-eval/secrets/forgejo-leo-token)
FORGEJO_URL="http://localhost:3000"
MAX=50
COUNT=0
SUCCESS=0
FAILED=0
SKIPPED=0
# Lockfile to prevent concurrent runs
LOCKFILE="/tmp/batch-extract.lock"
if [ -f "$LOCKFILE" ]; then
pid=$(cat "$LOCKFILE" 2>/dev/null)
if kill -0 "$pid" 2>/dev/null; then
echo "[$(date)] SKIP: batch extract already running (pid $pid)" >> $LOG
exit 0
fi
rm -f "$LOCKFILE"
fi
echo $$ > "$LOCKFILE"
trap 'rm -f "$LOCKFILE"' EXIT
echo "[$(date)] Starting batch extraction of $MAX sources" >> $LOG
cd $REPO || exit 1
git fetch origin main 2>/dev/null
git checkout -f main 2>/dev/null
git reset --hard origin/main 2>/dev/null
# Pre-extraction cleanup: remove queue files that already exist in archive
# This runs on the MAIN worktree (not extract/) so deletions are committed to git.
# Prevents the "queue duplicate reappears after reset --hard" problem.
CLEANED=0
for qfile in $MAIN_REPO/inbox/queue/*.md; do
[ -f "$qfile" ] || continue
qbase=$(basename "$qfile")
if find "$MAIN_REPO/inbox/archive" -name "$qbase" 2>/dev/null | grep -q .; then
rm -f "$qfile"
CLEANED=$((CLEANED + 1))
fi
done
if [ "$CLEANED" -gt 0 ]; then
echo "[$(date)] Cleaned $CLEANED stale queue duplicates" >> $LOG
cd $MAIN_REPO
git add -A inbox/queue/ 2>/dev/null
git commit -m "pipeline: clean $CLEANED stale queue duplicates
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>" 2>/dev/null
# Push with retry
for attempt in 1 2 3; do
git pull --rebase origin main 2>/dev/null
git push origin main 2>/dev/null && break
sleep 2
done
cd $REPO
git fetch origin main 2>/dev/null
git reset --hard origin/main 2>/dev/null
fi
# Get sources in queue
SOURCES=$(ls inbox/queue/*.md 2>/dev/null | head -$MAX)
# Batch fetch all remote branches once (Ganymede: 1 call instead of 84)
REMOTE_BRANCHES=$(git ls-remote --heads origin 2>/dev/null)
if [ $? -ne 0 ]; then
echo "[$(date)] ABORT: git ls-remote failed — remote unreachable, skipping cycle" >> $LOG
exit 0
fi
for SOURCE in $SOURCES; do
COUNT=$((COUNT + 1))
BASENAME=$(basename "$SOURCE" .md)
BRANCH="extract/$BASENAME"
# Gate 1: Already in archive? Source was already processed — dedup (Ganymede)
if find "$MAIN_REPO/inbox/archive" -name "$BASENAME.md" 2>/dev/null | grep -q .; then
echo "[$(date)] [$COUNT/$MAX] SKIP $BASENAME (already in archive)" >> $LOG
# Delete the queue duplicate
rm -f "$MAIN_REPO/inbox/queue/$BASENAME.md" 2>/dev/null
SKIPPED=$((SKIPPED + 1))
continue
fi
# Gate 2: Branch exists on Forgejo? Extraction already in progress (cached lookup)
if echo "$REMOTE_BRANCHES" | grep -q "refs/heads/$BRANCH$"; then
echo "[$(date)] [$COUNT/$MAX] SKIP $BASENAME (branch exists — in progress)" >> $LOG
SKIPPED=$((SKIPPED + 1))
continue
fi
echo "[$(date)] [$COUNT/$MAX] Processing $BASENAME" >> $LOG
# Reset to main
git checkout -f main 2>/dev/null
git fetch origin main 2>/dev/null
git reset --hard origin/main 2>/dev/null
# Clean stale remote branch (Leo's catch — prevents checkout conflicts)
git push origin --delete "$BRANCH" 2>/dev/null
# Create fresh branch
git branch -D "$BRANCH" 2>/dev/null
git checkout -b "$BRANCH" 2>/dev/null
if [ $? -ne 0 ]; then
echo " -> SKIP (branch creation failed)" >> $LOG
SKIPPED=$((SKIPPED + 1))
continue
fi
# Run extraction
python3 $EXTRACT "$SOURCE" --no-review >> $LOG 2>&1
EXTRACT_RC=$?
if [ $EXTRACT_RC -ne 0 ]; then
FAILED=$((FAILED + 1))
echo " -> FAILED (extract rc=$EXTRACT_RC)" >> $LOG
continue
fi
# Post-extraction cleanup
python3 $CLEANUP $REPO >> $LOG 2>&1
# Check if any files were created/modified
CHANGED=$(git status --porcelain | wc -l | tr -d " ")
if [ "$CHANGED" -eq 0 ]; then
echo " -> No changes (enrichment/null-result only)" >> $LOG
continue
fi
# Commit
git add -A
git commit -m "extract: $BASENAME
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>" >> $LOG 2>&1
# Push
git push "http://leo:${TOKEN}@localhost:3000/teleo/teleo-codex.git" "$BRANCH" --force >> $LOG 2>&1
# Create PR
curl -sf -X POST "http://localhost:3000/api/v1/repos/teleo/teleo-codex/pulls" \
-H "Authorization: token $TOKEN" \
-H "Content-Type: application/json" \
-d "{\"title\":\"extract: $BASENAME\",\"head\":\"$BRANCH\",\"base\":\"main\"}" >> /dev/null 2>&1
SUCCESS=$((SUCCESS + 1))
echo " -> SUCCESS ($CHANGED files)" >> $LOG
# Back to main
git checkout -f main 2>/dev/null
# Rate limit
sleep 2
done
echo "[$(date)] Batch complete: $SUCCESS success, $FAILED failed, $SKIPPED skipped (already attempted)" >> $LOG
git checkout -f main 2>/dev/null
git reset --hard origin/main 2>/dev/null