#!/bin/bash
# ops/extract-cron.sh
# Provenance: added in commit e991e55ab738e143e552e490b208b79671c39f77
# ("Auto: ops/extract-cron.sh").
#
# Extract claims from unprocessed sources in inbox/archive/
# Runs via cron on VPS every 15 minutes.
#
# Flow:
# 1. Take the single-instance lock
# 2. Reset the workspace to a clean main and pull
# 3. Find sources with status: unprocessed (at most MAX_SOURCES per run)
# 4. For each: run Claude headless to extract claims; the prompt also asks
#    Claude to update the source's status in its frontmatter
# 5. Commit extractions, push the branch, open a PR
#
# A failure while handling one source is logged and the run moves on to the
# next source; the workspace is reset to a clean main after every source.
#
# The eval pipeline (webhook.py) handles review and merge separately.

set -euo pipefail

REPO_DIR="/opt/teleo-eval/workspaces/extract"
SECRETS_DIR="/opt/teleo-eval/secrets"
# Assigned separately so a missing token file aborts under `set -e`.
ADMIN_TOKEN=$(cat "$SECRETS_DIR/forgejo-admin-token")
# NOTE(review): the token is embedded in the URL, so it ends up in the clone's
# remote config and may appear in git error output written to $LOG.
REPO_URL="http://m3taversal:${ADMIN_TOKEN}@localhost:3000/teleo/teleo-codex.git"
API_URL="http://localhost:3000/api/v1"
CLAUDE_BIN="/home/teleo/.local/bin/claude"
LOG_DIR="/opt/teleo-eval/logs"
LOG="$LOG_DIR/extract-cron.log"
LOCKFILE="/tmp/extract-cron.lock"
MAX_SOURCES=5 # Process at most 5 sources per run to limit cost

# Append a timestamped line to the run log.
log() { printf '[%s] %s\n' "$(date -Iseconds)" "$*" >> "$LOG"; }

# Escape a string for use inside a JSON string literal (without the
# surrounding quotes). Handles backslash, double quote, newline, CR and tab;
# other control characters are passed through unchanged.
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Atomically create the lockfile containing our PID. With noclobber the
# redirection fails if the file already exists, so check and create happen
# in a single step.
try_lock() {
  ( set -o noclobber; echo "$$" > "$LOCKFILE" ) 2>/dev/null
}

# Discard every uncommitted change and untracked file in the workspace and
# switch back to main. Must be run from inside $REPO_DIR, which is assumed to
# be a checkout used only by this job. Without this, leftovers from a failed
# extraction would leak into the next branch and block `git pull --rebase`
# on the next run.
reset_to_main() {
  git reset --hard >> "$LOG" 2>&1 \
    && git clean -fd >> "$LOG" 2>&1 \
    && git checkout main >> "$LOG" 2>&1
}

# Extract claims from one source file and open a PR for the result.
# Arguments: $1 - path of the source file, relative to the repo root
# Returns:   0 - branch pushed and PR opened (or a PR already existed)
#            1 - failure (details in $LOG)
#            2 - extraction produced nothing to commit
# Callers invoke this as `process_source ... || rc=$?`, which disables
# `set -e` inside the function, so every critical step is checked explicitly.
process_source() {
  local source_file=$1
  local slug branch domain agent agent_token prompt
  local pr_title pr_body payload response http_status
  local -a stage_paths

  slug=$(basename "$source_file" .md)
  branch="extract/$slug"

  log "Processing: $source_file → branch $branch"

  # Read domain from frontmatter: first `domain:` line, quotes and
  # surrounding whitespace removed. awk exits 0 when nothing matches, so a
  # missing field yields an empty string instead of a failed pipeline.
  domain=$(awk '/^domain:/ { sub(/^domain:[[:space:]]*/, ""); print; exit }' \
    "$source_file" 2>> "$LOG") || domain=""
  domain=${domain//\"/}
  domain=${domain//\'/}
  domain="${domain#"${domain%%[![:space:]]*}"}"
  domain="${domain%"${domain##*[![:space:]]}"}"

  # The domain becomes a path component (domains/<domain>/) and part of the
  # prompt, so reject empty values and anything that is not a plain name.
  if [[ ! "$domain" =~ ^[A-Za-z0-9][A-Za-z0-9_-]*$ ]]; then
    log "WARN: missing or invalid domain '$domain' in $source_file, skipping"
    return 1
  fi

  # Map domain to agent
  case "$domain" in
    internet-finance) agent="rio" ;;
    entertainment) agent="clay" ;;
    ai-alignment) agent="theseus" ;;
    health) agent="vida" ;;
    space-development) agent="astra" ;;
    *) agent="leo" ;;
  esac

  # Per-agent token, falling back to leo's token when the agent has none.
  agent_token=$(cat "$SECRETS_DIR/forgejo-${agent}-token" 2>/dev/null \
    || cat "$SECRETS_DIR/forgejo-leo-token" 2>> "$LOG") || {
    log "WARN: no Forgejo token available for agent $agent"
    return 1
  }

  log "Domain: $domain, Agent: $agent"

  # Create a fresh branch from main, replacing any branch left by an
  # earlier run for the same source.
  git checkout main >> "$LOG" 2>&1 || {
    log "WARN: cannot check out main before processing $source_file"
    return 1
  }
  git branch -D "$branch" >> "$LOG" 2>&1 || true
  git checkout -b "$branch" >> "$LOG" 2>&1 || {
    log "WARN: cannot create branch $branch"
    return 1
  }

  prompt="You are $agent, a Teleo knowledge base agent. Extract claims from this source.

READ these files first:
- skills/extract.md (extraction process)
- schemas/claim.md (claim format)
- $source_file (the source to extract from)

Then scan domains/$domain/ to check for duplicate claims.

EXTRACT claims following the process in skills/extract.md:
1. Read the source completely
2. Separate evidence from interpretation
3. Extract candidate claims (specific, disagreeable, evidence-backed)
4. Check for duplicates against existing claims in domains/$domain/
5. Write claim files to domains/$domain/ with proper YAML frontmatter
6. Update $source_file: set status to 'processed', add processed_by: $agent, processed_date: $(date +%Y-%m-%d), and claims_extracted list

If no claims can be extracted, update $source_file: set status to 'null-result' and add notes explaining why.

IMPORTANT: Use the Edit tool to update the source file status. Use the Write tool to create new claim files. Do not create claims that duplicate existing ones."

  # Run extraction with timeout (10 minutes)
  if ! timeout 600 "$CLAUDE_BIN" -p "$prompt" \
    --allowedTools 'Read,Write,Edit,Glob,Grep' \
    --model sonnet \
    >> "$LOG" 2>&1; then
    log "WARN: Claude extraction failed or timed out for $source_file"
    return 1
  fi

  # Stage only the paths the extraction is expected to touch. The domain
  # directory is included only if it exists: `git add` aborts on a pathspec
  # that matches nothing, e.g. after a null-result in a brand-new domain.
  stage_paths=(inbox/archive/)
  if [[ -d "domains/$domain" ]]; then
    stage_paths+=("domains/$domain/")
  fi
  git add -- "${stage_paths[@]}" >> "$LOG" 2>&1 || {
    log "WARN: git add failed for $source_file"
    return 1
  }

  # Decide on what is actually staged, not on `git status`: edits outside
  # the staged paths would otherwise lead to an empty, failing commit.
  if git diff --cached --quiet; then
    log "No changes produced for $source_file"
    return 2
  fi

  # NOTE(review): in the original the Pentagon-Agent value was followed by a
  # trailing space before the closing quote; confirm no suffix was lost.
  git commit -m "$agent: extract claims from $(basename "$source_file")

- Source: $source_file
- Domain: $domain
- Extracted by: headless extraction cron

Pentagon-Agent: ${agent^}" >> "$LOG" 2>&1 || {
    log "WARN: git commit failed for $source_file"
    return 1
  }

  # Push branch (forced: the branch is recreated from main on every run)
  git push -u "$REPO_URL" "$branch" --force >> "$LOG" 2>&1 || {
    log "WARN: git push failed for branch $branch"
    return 1
  }

  # Open PR. The body is built with real newlines and every field is
  # JSON-escaped, so file-derived text cannot break the payload.
  pr_title="$agent: extract claims from $slug"
  printf -v pr_body '%s\n\n%s\n%s\n%s\n\n%s' \
    '## Automated Extraction' \
    "Source: \`$source_file\`" \
    "Domain: $domain" \
    'Extracted by: headless cron on VPS' \
    'This PR was created automatically by the extraction cron job. Claims were extracted using `skills/extract.md` process via Claude headless.'
  printf -v payload '{"title":"%s","body":"%s","base":"main","head":"%s"}' \
    "$(json_escape "$pr_title")" \
    "$(json_escape "$pr_body")" \
    "$(json_escape "$branch")"

  # -w appends the HTTP status on its own final line so it can be separated
  # from the response body, which goes to the log.
  response=$(curl -sS -X POST "$API_URL/repos/teleo/teleo-codex/pulls" \
    -H "Authorization: token $agent_token" \
    -H "Content-Type: application/json" \
    -w '\n%{http_code}' \
    -d "$payload" 2>> "$LOG") || response=$'\n000'
  http_status=${response##*$'\n'}
  printf '%s\n' "${response%$'\n'*}" >> "$LOG"

  case "$http_status" in
    2??)
      log "PR opened for $source_file"
      ;;
    409)
      # Presumably an open PR for this branch already exists (TODO confirm
      # against the Forgejo API); the forced push has already updated it.
      log "PR already exists for $branch, branch updated"
      ;;
    *)
      log "WARN: PR creation failed for $source_file (HTTP $http_status)"
      return 1
      ;;
  esac

  return 0
}

# Entry point: lock, sync the workspace, process up to MAX_SOURCES sources.
main() {
  local holder src rc
  local opened=0 unchanged=0 failed=0
  local -a sources=()

  mkdir -p "$LOG_DIR"

  # --- Lock ---
  # Stale-lock recovery still has a small window in which two starters can
  # both remove and recreate the file; cron's 15-minute spacing makes that
  # unlikely in practice.
  if ! try_lock; then
    holder=$(cat "$LOCKFILE" 2>/dev/null || true)
    if [[ -n "$holder" ]] && kill -0 "$holder" 2>/dev/null; then
      log "SKIP: already running (pid $holder)"
      exit 0
    fi
    log "WARN: stale lockfile, removing"
    rm -f "$LOCKFILE"
    if ! try_lock; then
      log "SKIP: lock taken by another run during stale-lock cleanup"
      exit 0
    fi
  fi
  # Installed only once the lock is ours, so a skipped run never deletes
  # another run's lockfile.
  trap 'rm -f "$LOCKFILE"' EXIT

  # --- Ensure repo clone ---
  if [[ ! -d "$REPO_DIR/.git" ]]; then
    log "Cloning repo..."
    git clone "$REPO_URL" "$REPO_DIR" >> "$LOG" 2>&1
  fi

  cd "$REPO_DIR"

  # --- Pull latest main ---
  # Clean first: an interrupted earlier run may have left the tree dirty or
  # on an extract/ branch, which would make the pull fail.
  if ! reset_to_main; then
    log "ERROR: cannot reset workspace to main"
    exit 1
  fi
  if ! git pull --rebase >> "$LOG" 2>&1; then
    log "ERROR: git pull --rebase failed"
    exit 1
  fi

  # --- Find unprocessed sources ---
  # Read one path per line into an array so paths are never word-split.
  mapfile -t sources < <(
    grep -rl '^status: unprocessed' inbox/archive/ 2>/dev/null \
      | head -n "$MAX_SOURCES" || true
  )

  if (( ${#sources[@]} == 0 )); then
    log "No unprocessed sources found"
    exit 0
  fi

  log "Found ${#sources[@]} unprocessed source(s)"

  # --- Process each source ---
  for src in "${sources[@]}"; do
    rc=0
    process_source "$src" || rc=$?
    case "$rc" in
      0) opened=$((opened + 1)) ;;
      2) unchanged=$((unchanged + 1)) ;;
      *) failed=$((failed + 1)) ;;
    esac

    # Back to a clean main for the next source, whatever happened above.
    if ! reset_to_main; then
      log "ERROR: cannot restore clean main after $src, aborting run"
      exit 1
    fi

    # Brief pause between extractions
    sleep 5
  done

  log "Extraction run complete: processed ${#sources[@]} source(s) ($opened PR(s) opened or updated, $unchanged without changes, $failed failed)"
}

main "$@"