Auto: ops/extract-cron.sh | 1 file changed, 96 insertions(+), 27 deletions(-)

This commit is contained in:
m3taversal 2026-03-10 10:48:26 +00:00
parent 0c83c78132
commit 751a651ce8

View file

@ -18,20 +18,23 @@
# 1. Pull latest main
# 2. Find sources with status: unprocessed (skip processing/processed/null-result)
# 3. For each: run Claude headless to extract claims as the domain agent
# 4. Record the source in the pending file (prevents re-processing next cycle)
# 5. Commit extractions on branch, push, open PR
# 6. Eval pipeline reviews the extraction PR separately
#
# The eval pipeline (webhook.py) handles review and merge separately.
set -euo pipefail set -euo pipefail
REPO_DIR="/opt/teleo-eval/workspaces/extract" REPO_DIR="/opt/teleo-eval/workspaces/extract"
REPO_URL="http://m3taversal:$(cat /opt/teleo-eval/secrets/forgejo-admin-token)@localhost:3000/teleo/teleo-codex.git" FORGEJO_URL="http://localhost:3000"
FORGEJO_ADMIN_TOKEN=$(cat /opt/teleo-eval/secrets/forgejo-admin-token)
CLAUDE_BIN="/home/teleo/.local/bin/claude" CLAUDE_BIN="/home/teleo/.local/bin/claude"
LOG_DIR="/opt/teleo-eval/logs" LOG_DIR="/opt/teleo-eval/logs"
LOG="$LOG_DIR/extract-cron.log" LOG="$LOG_DIR/extract-cron.log"
LOCKFILE="/tmp/extract-cron.lock" LOCKFILE="/tmp/extract-cron.lock"
MAX_SOURCES=5 # Process at most 5 sources per run to limit cost PENDING_FILE="/opt/teleo-eval/extract-pending.txt"
MAX_SOURCES=5 # Process at most 5 sources per run
# Append one timestamped line to the cron log.
# Globals:   LOG (read) - path of the log file that is appended to.
# Arguments: any number of words; they are joined with single spaces.
# Outputs:   "[<ISO-8601 timestamp, seconds precision>] <message>" appended to $LOG.
log() {
  # printf with a fixed format: the message is never interpreted as options
  # or escape sequences, whatever it contains.
  printf '[%s] %s\n' "$(date -Iseconds)" "$*" >> "$LOG"
}
@ -48,20 +51,37 @@ fi
echo $$ > "$LOCKFILE" echo $$ > "$LOCKFILE"
trap 'rm -f "$LOCKFILE"' EXIT trap 'rm -f "$LOCKFILE"' EXIT
# --- Init pending file ---
touch "$PENDING_FILE"
# --- Ensure repo clone --- # --- Ensure repo clone ---
if [ ! -d "$REPO_DIR/.git" ]; then if [ ! -d "$REPO_DIR/.git" ]; then
log "Cloning repo..." log "Cloning repo..."
git clone "$REPO_URL" "$REPO_DIR" >> "$LOG" 2>&1 git -c http.extraHeader="Authorization: token $FORGEJO_ADMIN_TOKEN" \
clone "${FORGEJO_URL}/teleo/teleo-codex.git" "$REPO_DIR" >> "$LOG" 2>&1
fi fi
cd "$REPO_DIR" cd "$REPO_DIR"
# Configure git auth via a credential helper (keeps tokens out of logs and URLs).
# The helper is single-quoted on purpose: the token is NOT expanded into
# .git/config. Git runs the helper through the shell whenever it needs
# credentials, and the helper reads the secret file at that moment, so the
# token is never persisted in the clone and a rotated token is picked up
# without reconfiguring.
git config credential.helper \
  '!f() { echo username=m3taversal; echo "password=$(cat /opt/teleo-eval/secrets/forgejo-admin-token)"; }; f'
# Point origin at the token-free URL. Best-effort: a failure here is ignored.
git remote set-url origin "${FORGEJO_URL}/teleo/teleo-codex.git" 2>/dev/null || true
# --- Pull latest main --- # --- Pull latest main ---
git checkout main >> "$LOG" 2>&1 git checkout main >> "$LOG" 2>&1
git pull --rebase >> "$LOG" 2>&1 git pull --rebase >> "$LOG" 2>&1
# --- Find unprocessed sources ---
#######################################
# List archive sources that still need extraction.
# A source qualifies when a line starting with `status: unprocessed` appears
# inside its YAML frontmatter, i.e. between the first two `---` lines.
# Arguments:
#   $1 - pending list: one source path per line; listed paths are excluded.
#        May be empty, or name a missing/empty file (then nothing is excluded).
#   $2 - maximum number of paths to print (defaults to 5).
# Outputs:
#   Matching paths under inbox/archive/ (relative to the current directory),
#   one per line, in glob order. Prints an empty line when nothing matches.
#######################################
find_unprocessed_sources() {
  local pending_file=${1:-}
  local limit=${2:-5}
  local candidates

  # awk variables persist from one input file to the next, so the fence
  # counter must be reset on the first line of every file. Without the reset,
  # only the first file on the command line could ever match.
  # `|| true`: a missing directory or unmatched glob simply means "no sources".
  candidates=$(awk '
    FNR == 1 { fences = 0 }
    /^---$/ { if (++fences >= 2) nextfile; next }
    fences == 1 && /^status: unprocessed/ { print FILENAME; nextfile }
  ' inbox/archive/*.md 2>/dev/null || true)

  # Filter out sources already pending extraction (exact whole-line match).
  # grep exits 1 when every candidate is filtered out; that is not an error.
  if [ -n "$pending_file" ] && [ -s "$pending_file" ]; then
    candidates=$(printf '%s\n' "$candidates" | grep -vxFf "$pending_file" || true)
  fi

  # Limit to the requested count. A here-string is used instead of a pipe so
  # that head stopping early cannot raise SIGPIPE under `set -o pipefail`.
  head -n "$limit" <<< "$candidates"
}

UNPROCESSED=$(find_unprocessed_sources "$PENDING_FILE" "$MAX_SOURCES")
if [ -z "$UNPROCESSED" ]; then if [ -z "$UNPROCESSED" ]; then
log "No unprocessed sources found" log "No unprocessed sources found"
@ -78,13 +98,16 @@ for SOURCE_FILE in $UNPROCESSED; do
log "Processing: $SOURCE_FILE → branch $BRANCH" log "Processing: $SOURCE_FILE → branch $BRANCH"
# Mark as pending (prevents re-processing on next cron cycle)
echo "$SOURCE_FILE" >> "$PENDING_FILE"
# Create branch from main # Create branch from main
git checkout main >> "$LOG" 2>&1 git checkout main >> "$LOG" 2>&1
git branch -D "$BRANCH" 2>/dev/null || true git branch -D "$BRANCH" 2>/dev/null || true
git checkout -b "$BRANCH" >> "$LOG" 2>&1 git checkout -b "$BRANCH" >> "$LOG" 2>&1
# Read domain from frontmatter # Read domain from frontmatter
DOMAIN=$(grep '^domain:' "$SOURCE_FILE" | head -1 | sed 's/domain: *//' | tr -d '"' | tr -d "'" | xargs) DOMAIN=$(awk '/^---$/{f++} f==1 && /^domain:/{sub(/^domain: */, ""); gsub(/["'"'"']/, ""); print; exit}' "$SOURCE_FILE")
# Map domain to agent # Map domain to agent
case "$DOMAIN" in case "$DOMAIN" in
@ -126,6 +149,7 @@ IMPORTANT: Use the Edit tool to update the source file status. Use the Write too
timeout 600 "$CLAUDE_BIN" -p "$EXTRACT_PROMPT" \ timeout 600 "$CLAUDE_BIN" -p "$EXTRACT_PROMPT" \
--allowedTools 'Read,Write,Edit,Glob,Grep' \ --allowedTools 'Read,Write,Edit,Glob,Grep' \
--model sonnet \ --model sonnet \
--permission-mode bypassPermissions \
>> "$LOG" 2>&1 || { >> "$LOG" 2>&1 || {
log "WARN: Claude extraction failed or timed out for $SOURCE_FILE" log "WARN: Claude extraction failed or timed out for $SOURCE_FILE"
git checkout main >> "$LOG" 2>&1 git checkout main >> "$LOG" 2>&1
@ -133,41 +157,74 @@ IMPORTANT: Use the Edit tool to update the source file status. Use the Write too
} }
# Check if any files were created/modified
CHANGED_FILES=$(git status --porcelain)
if [ -z "$CHANGED_FILES" ]; then
  log "No changes produced for $SOURCE_FILE"
  git checkout main >> "$LOG" 2>&1
  continue
fi

# Stage only files in expected paths.
# `git status --porcelain -z` emits NUL-terminated "XY <path>" records with the
# path verbatim. The plain porcelain format wraps paths containing spaces or
# other unusual characters in double quotes, so splitting it on whitespace
# truncates or mangles such paths and they would never match the patterns
# below. Nothing has been staged at this point, so no rename records (which
# carry a second path) are expected — NOTE(review): confirm if that changes.
while IFS= read -r -d '' status_entry; do
  # Drop the two status letters and the separating space.
  changed_path=${status_entry:3}
  case "$changed_path" in
    inbox/archive/*|domains/*)
      git add -- "$changed_path" >> "$LOG" 2>&1
      ;;
    *)
      log "WARN: Unexpected file change outside inbox/domains: $changed_path — skipping"
      ;;
  esac
done < <(git status --porcelain -z)

# Check if anything was staged
if git diff --cached --quiet; then
  log "No valid changes to commit for $SOURCE_FILE"
  # Discard modifications to tracked files before leaving the branch.
  git checkout -- . >> "$LOG" 2>&1
  git checkout main >> "$LOG" 2>&1
  continue
fi
AGENT_UPPER=$(echo "$AGENT" | sed 's/./\U&/')
git commit -m "$AGENT: extract claims from $(basename "$SOURCE_FILE") git commit -m "$AGENT: extract claims from $(basename "$SOURCE_FILE")
- Source: $SOURCE_FILE - Source: $SOURCE_FILE
- Domain: $DOMAIN - Domain: $DOMAIN
- Extracted by: headless extraction cron - Extracted by: headless extraction cron
Pentagon-Agent: $(echo "$AGENT" | sed 's/./\U&/') <HEADLESS>" >> "$LOG" 2>&1 Pentagon-Agent: $AGENT_UPPER <HEADLESS>" >> "$LOG" 2>&1
# Push branch # Push branch
git push -u "$REPO_URL" "$BRANCH" --force >> "$LOG" 2>&1 git push -u origin "$BRANCH" --force >> "$LOG" 2>&1
# Open PR # Check if PR already exists for this branch
PR_TITLE="$AGENT: extract claims from $(basename "$SOURCE_FILE" .md)" EXISTING_PR=$(curl -s "${FORGEJO_URL}/api/v1/repos/teleo/teleo-codex/pulls?state=open" \
PR_BODY="## Automated Extraction\n\nSource: \`$SOURCE_FILE\`\nDomain: $DOMAIN\nExtracted by: headless cron on VPS\n\nThis PR was created automatically by the extraction cron job. Claims were extracted using \`skills/extract.md\` process via Claude headless."
curl -s -X POST "http://localhost:3000/api/v1/repos/teleo/teleo-codex/pulls" \
-H "Authorization: token $AGENT_TOKEN" \ -H "Authorization: token $AGENT_TOKEN" \
-H "Content-Type: application/json" \ | jq -r ".[] | select(.head.ref == \"$BRANCH\") | .number" 2>/dev/null)
-d "{
\"title\": \"$PR_TITLE\",
\"body\": \"$PR_BODY\",
\"base\": \"main\",
\"head\": \"$BRANCH\"
}" >> "$LOG" 2>&1
log "PR opened for $SOURCE_FILE" if [ -n "$EXISTING_PR" ]; then
log "PR already exists for $BRANCH (#$EXISTING_PR), skipping creation"
else
# Build PR JSON safely with jq
PR_JSON=$(jq -n \
--arg title "$AGENT: extract claims from $(basename "$SOURCE_FILE" .md)" \
--arg body "## Automated Extraction
Source: \`$SOURCE_FILE\`
Domain: $DOMAIN
Extracted by: headless cron on VPS
This PR was created automatically by the extraction cron job. Claims were extracted using \`skills/extract.md\` process via Claude headless." \
--arg base "main" \
--arg head "$BRANCH" \
'{title: $title, body: $body, base: $base, head: $head}')
curl -s -X POST "${FORGEJO_URL}/api/v1/repos/teleo/teleo-codex/pulls" \
-H "Authorization: token $AGENT_TOKEN" \
-H "Content-Type: application/json" \
-d "$PR_JSON" >> "$LOG" 2>&1
log "PR opened for $SOURCE_FILE"
fi
# Back to main for next source # Back to main for next source
git checkout main >> "$LOG" 2>&1 git checkout main >> "$LOG" 2>&1
@ -176,4 +233,16 @@ Pentagon-Agent: $(echo "$AGENT" | sed 's/./\U&/') <HEADLESS>" >> "$LOG" 2>&1
sleep 5 sleep 5
done done
# Clean up pending file — keep only entries whose source still exists in the
# current checkout with `status: unprocessed` in its frontmatter. Every other
# entry (status changed, or file gone) is dropped from the list.
#######################################
# Rewrite a pending list in place, dropping entries that are no longer
# unprocessed.
# Arguments:
#   $1 - path of the pending list (one source path per line, resolved
#        relative to the current directory). A missing list is a no-op.
# Returns: 0 on success, 1 if the temporary file cannot be created or moved.
#######################################
prune_pending_file() {
  local pending_file=$1
  local tmp_list pending_source

  [ -f "$pending_file" ] || return 0

  # Create the temp file next to the list so the final mv is a rename on the
  # same filesystem rather than a copy.
  tmp_list=$(mktemp "${pending_file}.XXXXXX") || return 1

  while IFS= read -r pending_source || [ -n "$pending_source" ]; do
    if [ -z "$pending_source" ] || [ ! -f "$pending_source" ]; then
      continue
    fi
    # Same rule used to select sources: the status line only counts between
    # the first two `---` lines. awk exits 0 only when it was found there.
    if awk '
      /^---$/ { if (++fences >= 2) exit; next }
      fences == 1 && /^status: unprocessed/ { found = 1; exit }
      END { exit !found }
    ' "$pending_source" 2>/dev/null; then
      printf '%s\n' "$pending_source" >> "$tmp_list"
    fi
  done < "$pending_file"

  if ! mv -- "$tmp_list" "$pending_file"; then
    rm -f -- "$tmp_list"
    return 1
  fi
}

prune_pending_file "$PENDING_FILE"
log "Extraction run complete: processed $COUNT source(s)" log "Extraction run complete: processed $COUNT source(s)"