From 751a651ce8195d158047aef24fdd1f900304d4bd Mon Sep 17 00:00:00 2001 From: m3taversal Date: Tue, 10 Mar 2026 10:48:26 +0000 Subject: [PATCH] Auto: ops/extract-cron.sh | 1 file changed, 96 insertions(+), 27 deletions(-) --- ops/extract-cron.sh | 121 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 95 insertions(+), 26 deletions(-) diff --git a/ops/extract-cron.sh b/ops/extract-cron.sh index a08789d8..70bd3fb8 100755 --- a/ops/extract-cron.sh +++ b/ops/extract-cron.sh @@ -18,20 +18,23 @@ # 1. Pull latest main # 2. Find sources with status: unprocessed (skip processing/processed/null-result) # 3. For each: run Claude headless to extract claims as the domain agent -# 4. Commit extractions, push, open PR -# 5. Update source status to processed +# 4. Mark source as processing on main (prevents re-processing next cycle) +# 5. Commit extractions on branch, push, open PR +# 6. Eval pipeline reviews the extraction PR separately # # The eval pipeline (webhook.py) handles review and merge separately. set -euo pipefail REPO_DIR="/opt/teleo-eval/workspaces/extract" -REPO_URL="http://m3taversal:$(cat /opt/teleo-eval/secrets/forgejo-admin-token)@localhost:3000/teleo/teleo-codex.git" +FORGEJO_URL="http://localhost:3000" +FORGEJO_ADMIN_TOKEN=$(cat /opt/teleo-eval/secrets/forgejo-admin-token) CLAUDE_BIN="/home/teleo/.local/bin/claude" LOG_DIR="/opt/teleo-eval/logs" LOG="$LOG_DIR/extract-cron.log" LOCKFILE="/tmp/extract-cron.lock" -MAX_SOURCES=5 # Process at most 5 sources per run to limit cost +PENDING_FILE="/opt/teleo-eval/extract-pending.txt" +MAX_SOURCES=5 # Process at most 5 sources per run log() { echo "[$(date -Iseconds)] $*" >> "$LOG"; } @@ -48,20 +51,37 @@ fi echo $$ > "$LOCKFILE" trap 'rm -f "$LOCKFILE"' EXIT +# --- Init pending file --- +touch "$PENDING_FILE" + # --- Ensure repo clone --- if [ ! -d "$REPO_DIR/.git" ]; then log "Cloning repo..." - git clone "$REPO_URL" "$REPO_DIR" >> "$LOG" 2>&1 + git -c http.extraHeader="Authorization: token $FORGEJO_ADMIN_TOKEN" \ + clone "${FORGEJO_URL}/teleo/teleo-codex.git" "$REPO_DIR" >> "$LOG" 2>&1 fi cd "$REPO_DIR" +# Configure git auth via credential helper (keeps tokens out of logs) +git config credential.helper "!f() { echo username=m3taversal; echo password=$FORGEJO_ADMIN_TOKEN; }; f" +git remote set-url origin "${FORGEJO_URL}/teleo/teleo-codex.git" 2>/dev/null || true + # --- Pull latest main --- git checkout main >> "$LOG" 2>&1 git pull --rebase >> "$LOG" 2>&1 # --- Find unprocessed sources --- -UNPROCESSED=$(grep -rl '^status: unprocessed' inbox/archive/ 2>/dev/null | head -n "$MAX_SOURCES" || true) +# Only match status: unprocessed within YAML frontmatter (between first two --- lines) +UNPROCESSED=$(awk '/^---$/{f++} f==1 && /^status: unprocessed/{print FILENAME; nextfile}' inbox/archive/*.md 2>/dev/null || true) + +# Filter out sources already pending extraction +if [ -s "$PENDING_FILE" ]; then + UNPROCESSED=$(echo "$UNPROCESSED" | grep -vxFf "$PENDING_FILE" || true) +fi + +# Limit to MAX_SOURCES +UNPROCESSED=$(echo "$UNPROCESSED" | head -n "$MAX_SOURCES") if [ -z "$UNPROCESSED" ]; then log "No unprocessed sources found" @@ -78,13 +98,16 @@ for SOURCE_FILE in $UNPROCESSED; do log "Processing: $SOURCE_FILE → branch $BRANCH" + # Mark as pending (prevents re-processing on next cron cycle) + echo "$SOURCE_FILE" >> "$PENDING_FILE" + # Create branch from main git checkout main >> "$LOG" 2>&1 git branch -D "$BRANCH" 2>/dev/null || true git checkout -b "$BRANCH" >> "$LOG" 2>&1 # Read domain from frontmatter - DOMAIN=$(grep '^domain:' "$SOURCE_FILE" | head -1 | sed 's/domain: *//' | tr -d '"' | tr -d "'" | xargs) + DOMAIN=$(awk '/^---$/{f++} f==1 && /^domain:/{sub(/^domain: */, ""); gsub(/["'"'"']/, ""); print; exit}' "$SOURCE_FILE") # Map domain to agent case "$DOMAIN" in @@ -126,6 +149,7 @@ IMPORTANT: Use the Edit tool to update the source file status. Use the Write too timeout 600 "$CLAUDE_BIN" -p "$EXTRACT_PROMPT" \ --allowedTools 'Read,Write,Edit,Glob,Grep' \ --model sonnet \ + --permission-mode bypassPermissions \ >> "$LOG" 2>&1 || { log "WARN: Claude extraction failed or timed out for $SOURCE_FILE" git checkout main >> "$LOG" 2>&1 @@ -133,41 +157,74 @@ IMPORTANT: Use the Edit tool to update the source file status. Use the Write too } # Check if any files were created/modified - CHANGES=$(git status --porcelain | wc -l | tr -d ' ') - if [ "$CHANGES" -eq 0 ]; then + CHANGED_FILES=$(git status --porcelain) + if [ -z "$CHANGED_FILES" ]; then log "No changes produced for $SOURCE_FILE" git checkout main >> "$LOG" 2>&1 continue fi - # Stage and commit - git add inbox/archive/ "domains/$DOMAIN/" >> "$LOG" 2>&1 + # Stage only files in expected paths + git status --porcelain | awk '{print $2}' | while read -r f; do + case "$f" in + inbox/archive/*|domains/*) + git add "$f" >> "$LOG" 2>&1 + ;; + *) + log "WARN: Unexpected file change outside inbox/domains: $f — skipping" + ;; + esac + done + + # Check if anything was staged + if git diff --cached --quiet; then + log "No valid changes to commit for $SOURCE_FILE" + git checkout -- . >> "$LOG" 2>&1 + git checkout main >> "$LOG" 2>&1 + continue + fi + + AGENT_UPPER=$(echo "$AGENT" | sed 's/./\U&/') git commit -m "$AGENT: extract claims from $(basename "$SOURCE_FILE") - Source: $SOURCE_FILE - Domain: $DOMAIN - Extracted by: headless extraction cron -Pentagon-Agent: $(echo "$AGENT" | sed 's/./\U&/') " >> "$LOG" 2>&1 +Pentagon-Agent: $AGENT_UPPER " >> "$LOG" 2>&1 # Push branch - git push -u "$REPO_URL" "$BRANCH" --force >> "$LOG" 2>&1 + git push -u origin "$BRANCH" --force >> "$LOG" 2>&1 - # Open PR - PR_TITLE="$AGENT: extract claims from $(basename "$SOURCE_FILE" .md)" - PR_BODY="## Automated Extraction\n\nSource: \`$SOURCE_FILE\`\nDomain: $DOMAIN\nExtracted by: headless cron on VPS\n\nThis PR was created automatically by the extraction cron job. Claims were extracted using \`skills/extract.md\` process via Claude headless." - - curl -s -X POST "http://localhost:3000/api/v1/repos/teleo/teleo-codex/pulls" \ + # Check if PR already exists for this branch + EXISTING_PR=$(curl -s "${FORGEJO_URL}/api/v1/repos/teleo/teleo-codex/pulls?state=open" \ -H "Authorization: token $AGENT_TOKEN" \ - -H "Content-Type: application/json" \ - -d "{ - \"title\": \"$PR_TITLE\", - \"body\": \"$PR_BODY\", - \"base\": \"main\", - \"head\": \"$BRANCH\" - }" >> "$LOG" 2>&1 + | jq -r ".[] | select(.head.ref == \"$BRANCH\") | .number" 2>/dev/null) - log "PR opened for $SOURCE_FILE" + if [ -n "$EXISTING_PR" ]; then + log "PR already exists for $BRANCH (#$EXISTING_PR), skipping creation" + else + # Build PR JSON safely with jq + PR_JSON=$(jq -n \ + --arg title "$AGENT: extract claims from $(basename "$SOURCE_FILE" .md)" \ + --arg body "## Automated Extraction + +Source: \`$SOURCE_FILE\` +Domain: $DOMAIN +Extracted by: headless cron on VPS + +This PR was created automatically by the extraction cron job. Claims were extracted using \`skills/extract.md\` process via Claude headless." \ + --arg base "main" \ + --arg head "$BRANCH" \ + '{title: $title, body: $body, base: $base, head: $head}') + + curl -s -X POST "${FORGEJO_URL}/api/v1/repos/teleo/teleo-codex/pulls" \ + -H "Authorization: token $AGENT_TOKEN" \ + -H "Content-Type: application/json" \ + -d "$PR_JSON" >> "$LOG" 2>&1 + + log "PR opened for $SOURCE_FILE" + fi # Back to main for next source git checkout main >> "$LOG" 2>&1 @@ -176,4 +233,16 @@ Pentagon-Agent: $(echo "$AGENT" | sed 's/./\U&/') " >> "$LOG" 2>&1 sleep 5 done +# Clean up pending file — remove entries for sources that have been processed +# (their PRs exist or their status changed on main) +if [ -f "$PENDING_FILE" ]; then + TEMP_PENDING=$(mktemp) + while IFS= read -r pending_source; do + if [ -f "$pending_source" ] && grep -q '^status: unprocessed' "$pending_source" 2>/dev/null; then + echo "$pending_source" >> "$TEMP_PENDING" + fi + done < "$PENDING_FILE" + mv "$TEMP_PENDING" "$PENDING_FILE" +fi + log "Extraction run complete: processed $COUNT source(s)"