Auto: ops/extract-cron.sh | 1 file changed, 96 insertions(+), 27 deletions(-)
This commit is contained in:
parent
0c83c78132
commit
751a651ce8
1 changed files with 95 additions and 26 deletions
|
|
@ -18,20 +18,23 @@
|
|||
# 1. Pull latest main
|
||||
# 2. Find sources with status: unprocessed (skip processing/processed/null-result)
|
||||
# 3. For each: run Claude headless to extract claims as the domain agent
|
||||
# 4. Commit extractions, push, open PR
|
||||
# 5. Update source status to processed
|
||||
# 4. Mark source as processing on main (prevents re-processing next cycle)
|
||||
# 5. Commit extractions on branch, push, open PR
|
||||
# 6. Eval pipeline reviews the extraction PR separately
|
||||
#
|
||||
# The eval pipeline (webhook.py) handles review and merge separately.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
REPO_DIR="/opt/teleo-eval/workspaces/extract"
|
||||
REPO_URL="http://m3taversal:$(cat /opt/teleo-eval/secrets/forgejo-admin-token)@localhost:3000/teleo/teleo-codex.git"
|
||||
FORGEJO_URL="http://localhost:3000"
|
||||
FORGEJO_ADMIN_TOKEN=$(cat /opt/teleo-eval/secrets/forgejo-admin-token)
|
||||
CLAUDE_BIN="/home/teleo/.local/bin/claude"
|
||||
LOG_DIR="/opt/teleo-eval/logs"
|
||||
LOG="$LOG_DIR/extract-cron.log"
|
||||
LOCKFILE="/tmp/extract-cron.lock"
|
||||
MAX_SOURCES=5 # Process at most 5 sources per run to limit cost
|
||||
PENDING_FILE="/opt/teleo-eval/extract-pending.txt"
|
||||
MAX_SOURCES=5 # Process at most 5 sources per run
|
||||
|
||||
# Append one timestamped line (ISO-8601, seconds precision) to the run log.
# All arguments are joined with spaces into a single message.
log() {
  printf '[%s] %s\n' "$(date -Iseconds)" "$*" >> "$LOG"
}
|
||||
|
||||
|
|
@ -48,20 +51,37 @@ fi
|
|||
echo $$ > "$LOCKFILE"
|
||||
trap 'rm -f "$LOCKFILE"' EXIT
|
||||
|
||||
# --- Init pending file ---
|
||||
touch "$PENDING_FILE"
|
||||
|
||||
# --- Ensure repo clone ---
|
||||
if [ ! -d "$REPO_DIR/.git" ]; then
|
||||
log "Cloning repo..."
|
||||
git clone "$REPO_URL" "$REPO_DIR" >> "$LOG" 2>&1
|
||||
git -c http.extraHeader="Authorization: token $FORGEJO_ADMIN_TOKEN" \
|
||||
clone "${FORGEJO_URL}/teleo/teleo-codex.git" "$REPO_DIR" >> "$LOG" 2>&1
|
||||
fi
|
||||
|
||||
cd "$REPO_DIR"
|
||||
|
||||
# Configure git auth via credential helper (keeps tokens out of logs)
|
||||
# The helper is single-quoted so the token is read from the secrets file each
# time git invokes it, instead of being expanded now and stored in plaintext in
# .git/config (and exposed on this command's argv).
git config credential.helper '!f() { echo username=m3taversal; echo "password=$(cat /opt/teleo-eval/secrets/forgejo-admin-token)"; }; f'
|
||||
git remote set-url origin "${FORGEJO_URL}/teleo/teleo-codex.git" 2>/dev/null || true
|
||||
|
||||
# --- Pull latest main ---
|
||||
git checkout main >> "$LOG" 2>&1
|
||||
git pull --rebase >> "$LOG" 2>&1
|
||||
|
||||
# --- Find unprocessed sources ---
|
||||
UNPROCESSED=$(grep -rl '^status: unprocessed' inbox/archive/ 2>/dev/null | head -n "$MAX_SOURCES" || true)
|
||||
# Only match status: unprocessed within YAML frontmatter (between first two --- lines)
|
||||
UNPROCESSED=$(awk '/^---$/{f++} f==1 && /^status: unprocessed/{print FILENAME; nextfile}' inbox/archive/*.md 2>/dev/null || true)
|
||||
|
||||
# Filter out sources already pending extraction
|
||||
if [ -s "$PENDING_FILE" ]; then
|
||||
UNPROCESSED=$(echo "$UNPROCESSED" | grep -vxFf "$PENDING_FILE" || true)
|
||||
fi
|
||||
|
||||
# Limit to MAX_SOURCES
|
||||
UNPROCESSED=$(echo "$UNPROCESSED" | head -n "$MAX_SOURCES")
|
||||
|
||||
if [ -z "$UNPROCESSED" ]; then
|
||||
log "No unprocessed sources found"
|
||||
|
|
@ -78,13 +98,16 @@ for SOURCE_FILE in $UNPROCESSED; do
|
|||
|
||||
log "Processing: $SOURCE_FILE → branch $BRANCH"
|
||||
|
||||
# Mark as pending (prevents re-processing on next cron cycle)
|
||||
echo "$SOURCE_FILE" >> "$PENDING_FILE"
|
||||
|
||||
# Create branch from main
|
||||
git checkout main >> "$LOG" 2>&1
|
||||
git branch -D "$BRANCH" 2>/dev/null || true
|
||||
git checkout -b "$BRANCH" >> "$LOG" 2>&1
|
||||
|
||||
# Read domain from frontmatter
|
||||
DOMAIN=$(grep '^domain:' "$SOURCE_FILE" | head -1 | sed 's/domain: *//' | tr -d '"' | tr -d "'" | xargs)
|
||||
DOMAIN=$(awk '/^---$/{f++} f==1 && /^domain:/{sub(/^domain: */, ""); gsub(/["'"'"']/, ""); print; exit}' "$SOURCE_FILE")
|
||||
|
||||
# Map domain to agent
|
||||
case "$DOMAIN" in
|
||||
|
|
@ -126,6 +149,7 @@ IMPORTANT: Use the Edit tool to update the source file status. Use the Write too
|
|||
timeout 600 "$CLAUDE_BIN" -p "$EXTRACT_PROMPT" \
|
||||
--allowedTools 'Read,Write,Edit,Glob,Grep' \
|
||||
--model sonnet \
|
||||
--permission-mode bypassPermissions \
|
||||
>> "$LOG" 2>&1 || {
|
||||
log "WARN: Claude extraction failed or timed out for $SOURCE_FILE"
|
||||
git checkout main >> "$LOG" 2>&1
|
||||
|
|
@ -133,41 +157,74 @@ IMPORTANT: Use the Edit tool to update the source file status. Use the Write too
|
|||
}
|
||||
|
||||
# Check if any files were created/modified
|
||||
CHANGES=$(git status --porcelain | wc -l | tr -d ' ')
|
||||
if [ "$CHANGES" -eq 0 ]; then
|
||||
CHANGED_FILES=$(git status --porcelain)
|
||||
if [ -z "$CHANGED_FILES" ]; then
|
||||
log "No changes produced for $SOURCE_FILE"
|
||||
git checkout main >> "$LOG" 2>&1
|
||||
continue
|
||||
fi
|
||||
|
||||
# Stage and commit
|
||||
git add inbox/archive/ "domains/$DOMAIN/" >> "$LOG" 2>&1
|
||||
# Stage only files in expected paths
|
||||
# Walk every changed path and stage only those under inbox/archive/ or
# domains/; anything else is logged and left unstaged.
# Records are read NUL-delimited (--porcelain -z) so paths containing spaces or
# special characters arrive intact and unquoted, rather than being split on
# whitespace.
# NOTE(review): rename/copy records carry an extra NUL-separated source path;
# none are expected here because nothing has been staged yet — confirm.
git status --porcelain -z | while IFS= read -r -d '' entry; do
  f=${entry:3}   # strip the two status letters and the separating space
  case "$f" in
    inbox/archive/*|domains/*)
      git add -- "$f" >> "$LOG" 2>&1
      ;;
    *)
      log "WARN: Unexpected file change outside inbox/domains: $f — skipping"
      ;;
  esac
done
|
||||
|
||||
# Check if anything was staged
|
||||
if git diff --cached --quiet; then
  log "No valid changes to commit for $SOURCE_FILE"
  # Nothing usable was staged, so discard every stray change before moving on.
  # `git checkout -- .` only reverts tracked files; untracked files and
  # directories must be removed with `git clean`, otherwise they follow the
  # working tree onto main and into the next source's branch.
  git checkout -- . >> "$LOG" 2>&1
  git clean -fd >> "$LOG" 2>&1
  git checkout main >> "$LOG" 2>&1
  continue
fi
|
||||
|
||||
AGENT_UPPER=$(echo "$AGENT" | sed 's/./\U&/')
|
||||
git commit -m "$AGENT: extract claims from $(basename "$SOURCE_FILE")
|
||||
|
||||
- Source: $SOURCE_FILE
|
||||
- Domain: $DOMAIN
|
||||
- Extracted by: headless extraction cron
|
||||
|
||||
Pentagon-Agent: $(echo "$AGENT" | sed 's/./\U&/') <HEADLESS>" >> "$LOG" 2>&1
|
||||
Pentagon-Agent: $AGENT_UPPER <HEADLESS>" >> "$LOG" 2>&1
|
||||
|
||||
# Push branch
|
||||
git push -u "$REPO_URL" "$BRANCH" --force >> "$LOG" 2>&1
|
||||
git push -u origin "$BRANCH" --force >> "$LOG" 2>&1
|
||||
|
||||
# Open PR
|
||||
PR_TITLE="$AGENT: extract claims from $(basename "$SOURCE_FILE" .md)"
|
||||
PR_BODY="## Automated Extraction\n\nSource: \`$SOURCE_FILE\`\nDomain: $DOMAIN\nExtracted by: headless cron on VPS\n\nThis PR was created automatically by the extraction cron job. Claims were extracted using \`skills/extract.md\` process via Claude headless."
|
||||
# Check if PR already exists for this branch
|
||||
# Look up the number of an open PR whose head is this branch.
# Best-effort: under `set -euo pipefail` a failed request or an unexpected
# (non-array) response would otherwise abort the whole run after the branch has
# already been pushed, so any failure simply leaves EXISTING_PR empty.
# The branch name is passed to jq with --arg instead of being spliced into the
# filter text.
# NOTE(review): the pulls listing is paginated; a match beyond the first page
# would be missed — confirm the page size is sufficient.
EXISTING_PR=$(curl -s "${FORGEJO_URL}/api/v1/repos/teleo/teleo-codex/pulls?state=open" \
  -H "Authorization: token $AGENT_TOKEN" \
  | jq -r --arg branch "$BRANCH" '.[] | select(.head.ref == $branch) | .number' 2>/dev/null || true)
|
||||
|
||||
curl -s -X POST "http://localhost:3000/api/v1/repos/teleo/teleo-codex/pulls" \
|
||||
if [ -n "$EXISTING_PR" ]; then
|
||||
log "PR already exists for $BRANCH (#$EXISTING_PR), skipping creation"
|
||||
else
|
||||
# Build PR JSON safely with jq
|
||||
PR_JSON=$(jq -n \
|
||||
--arg title "$AGENT: extract claims from $(basename "$SOURCE_FILE" .md)" \
|
||||
--arg body "## Automated Extraction
|
||||
|
||||
Source: \`$SOURCE_FILE\`
|
||||
Domain: $DOMAIN
|
||||
Extracted by: headless cron on VPS
|
||||
|
||||
This PR was created automatically by the extraction cron job. Claims were extracted using \`skills/extract.md\` process via Claude headless." \
|
||||
--arg base "main" \
|
||||
--arg head "$BRANCH" \
|
||||
'{title: $title, body: $body, base: $base, head: $head}')
|
||||
|
||||
curl -s -X POST "${FORGEJO_URL}/api/v1/repos/teleo/teleo-codex/pulls" \
|
||||
-H "Authorization: token $AGENT_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{
|
||||
\"title\": \"$PR_TITLE\",
|
||||
\"body\": \"$PR_BODY\",
|
||||
\"base\": \"main\",
|
||||
\"head\": \"$BRANCH\"
|
||||
}" >> "$LOG" 2>&1
|
||||
-d "$PR_JSON" >> "$LOG" 2>&1
|
||||
|
||||
log "PR opened for $SOURCE_FILE"
|
||||
fi
|
||||
|
||||
# Back to main for next source
|
||||
git checkout main >> "$LOG" 2>&1
|
||||
|
|
@ -176,4 +233,16 @@ Pentagon-Agent: $(echo "$AGENT" | sed 's/./\U&/') <HEADLESS>" >> "$LOG" 2>&1
|
|||
sleep 5
|
||||
done
|
||||
|
||||
# Clean up pending file — remove entries for sources that have been processed
|
||||
# (an entry is kept only while its source file still exists and still has status: unprocessed)
|
||||
if [ -f "$PENDING_FILE" ]; then
|
||||
TEMP_PENDING=$(mktemp)
|
||||
while IFS= read -r pending_source; do
|
||||
if [ -f "$pending_source" ] && grep -q '^status: unprocessed' "$pending_source" 2>/dev/null; then
|
||||
echo "$pending_source" >> "$TEMP_PENDING"
|
||||
fi
|
||||
done < "$PENDING_FILE"
|
||||
mv "$TEMP_PENDING" "$PENDING_FILE"
|
||||
fi
|
||||
|
||||
log "Extraction run complete: processed $COUNT source(s)"
|
||||
|
|
|
|||
Loading…
Reference in a new issue