Auto: ops/extract-cron.sh | 1 file changed, 167 insertions(+)
This commit is contained in:
parent
fb18064f97
commit
e991e55ab7
1 changed files with 167 additions and 0 deletions
167
ops/extract-cron.sh
Normal file
167
ops/extract-cron.sh
Normal file
|
|
@ -0,0 +1,167 @@
|
|||
#!/bin/bash
|
||||
# Extract claims from unprocessed sources in inbox/archive/
|
||||
# Runs via cron on VPS every 15 minutes.
|
||||
#
|
||||
# Flow:
|
||||
# 1. Pull latest main
|
||||
# 2. Find sources with status: unprocessed
|
||||
# 3. For each: run Claude headless to extract claims
|
||||
# 4. Commit extractions, push, open PR
|
||||
# 5. Update source status to processed
|
||||
#
|
||||
# The eval pipeline (webhook.py) handles review and merge separately.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
REPO_DIR="/opt/teleo-eval/workspaces/extract"
|
||||
REPO_URL="http://m3taversal:$(cat /opt/teleo-eval/secrets/forgejo-admin-token)@localhost:3000/teleo/teleo-codex.git"
|
||||
CLAUDE_BIN="/home/teleo/.local/bin/claude"
|
||||
LOG_DIR="/opt/teleo-eval/logs"
|
||||
LOG="$LOG_DIR/extract-cron.log"
|
||||
LOCKFILE="/tmp/extract-cron.lock"
|
||||
MAX_SOURCES=5 # Process at most 5 sources per run to limit cost
|
||||
|
||||
log() { echo "[$(date -Iseconds)] $*" >> "$LOG"; }
|
||||
|
||||
# --- Lock ---
|
||||
if [ -f "$LOCKFILE" ]; then
|
||||
pid=$(cat "$LOCKFILE" 2>/dev/null)
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
log "SKIP: already running (pid $pid)"
|
||||
exit 0
|
||||
fi
|
||||
log "WARN: stale lockfile, removing"
|
||||
rm -f "$LOCKFILE"
|
||||
fi
|
||||
echo $$ > "$LOCKFILE"
|
||||
trap 'rm -f "$LOCKFILE"' EXIT
|
||||
|
||||
# --- Ensure repo clone ---
|
||||
if [ ! -d "$REPO_DIR/.git" ]; then
|
||||
log "Cloning repo..."
|
||||
git clone "$REPO_URL" "$REPO_DIR" >> "$LOG" 2>&1
|
||||
fi
|
||||
|
||||
cd "$REPO_DIR"
|
||||
|
||||
# --- Pull latest main ---
|
||||
git checkout main >> "$LOG" 2>&1
|
||||
git pull --rebase >> "$LOG" 2>&1
|
||||
|
||||
# --- Find unprocessed sources ---
|
||||
UNPROCESSED=$(grep -rl '^status: unprocessed' inbox/archive/ 2>/dev/null | head -n "$MAX_SOURCES" || true)
|
||||
|
||||
if [ -z "$UNPROCESSED" ]; then
|
||||
log "No unprocessed sources found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$UNPROCESSED" | wc -l | tr -d ' ')
|
||||
log "Found $COUNT unprocessed source(s)"
|
||||
|
||||
# --- Process each source ---
|
||||
for SOURCE_FILE in $UNPROCESSED; do
|
||||
SLUG=$(basename "$SOURCE_FILE" .md)
|
||||
BRANCH="extract/$SLUG"
|
||||
|
||||
log "Processing: $SOURCE_FILE → branch $BRANCH"
|
||||
|
||||
# Create branch from main
|
||||
git checkout main >> "$LOG" 2>&1
|
||||
git branch -D "$BRANCH" 2>/dev/null || true
|
||||
git checkout -b "$BRANCH" >> "$LOG" 2>&1
|
||||
|
||||
# Read domain from frontmatter
|
||||
DOMAIN=$(grep '^domain:' "$SOURCE_FILE" | head -1 | sed 's/domain: *//' | tr -d '"' | tr -d "'" | xargs)
|
||||
|
||||
# Map domain to agent
|
||||
case "$DOMAIN" in
|
||||
internet-finance) AGENT="rio" ;;
|
||||
entertainment) AGENT="clay" ;;
|
||||
ai-alignment) AGENT="theseus" ;;
|
||||
health) AGENT="vida" ;;
|
||||
space-development) AGENT="astra" ;;
|
||||
*) AGENT="leo" ;;
|
||||
esac
|
||||
|
||||
AGENT_TOKEN=$(cat "/opt/teleo-eval/secrets/forgejo-${AGENT}-token" 2>/dev/null || cat /opt/teleo-eval/secrets/forgejo-leo-token)
|
||||
|
||||
log "Domain: $DOMAIN, Agent: $AGENT"
|
||||
|
||||
# Run Claude headless to extract claims
|
||||
EXTRACT_PROMPT="You are $AGENT, a Teleo knowledge base agent. Extract claims from this source.
|
||||
|
||||
READ these files first:
|
||||
- skills/extract.md (extraction process)
|
||||
- schemas/claim.md (claim format)
|
||||
- $SOURCE_FILE (the source to extract from)
|
||||
|
||||
Then scan domains/$DOMAIN/ to check for duplicate claims.
|
||||
|
||||
EXTRACT claims following the process in skills/extract.md:
|
||||
1. Read the source completely
|
||||
2. Separate evidence from interpretation
|
||||
3. Extract candidate claims (specific, disagreeable, evidence-backed)
|
||||
4. Check for duplicates against existing claims in domains/$DOMAIN/
|
||||
5. Write claim files to domains/$DOMAIN/ with proper YAML frontmatter
|
||||
6. Update $SOURCE_FILE: set status to 'processed', add processed_by: $AGENT, processed_date: $(date +%Y-%m-%d), and claims_extracted list
|
||||
|
||||
If no claims can be extracted, update $SOURCE_FILE: set status to 'null-result' and add notes explaining why.
|
||||
|
||||
IMPORTANT: Use the Edit tool to update the source file status. Use the Write tool to create new claim files. Do not create claims that duplicate existing ones."
|
||||
|
||||
# Run extraction with timeout (10 minutes)
|
||||
timeout 600 "$CLAUDE_BIN" -p "$EXTRACT_PROMPT" \
|
||||
--allowedTools 'Read,Write,Edit,Glob,Grep' \
|
||||
--model sonnet \
|
||||
>> "$LOG" 2>&1 || {
|
||||
log "WARN: Claude extraction failed or timed out for $SOURCE_FILE"
|
||||
git checkout main >> "$LOG" 2>&1
|
||||
continue
|
||||
}
|
||||
|
||||
# Check if any files were created/modified
|
||||
CHANGES=$(git status --porcelain | wc -l | tr -d ' ')
|
||||
if [ "$CHANGES" -eq 0 ]; then
|
||||
log "No changes produced for $SOURCE_FILE"
|
||||
git checkout main >> "$LOG" 2>&1
|
||||
continue
|
||||
fi
|
||||
|
||||
# Stage and commit
|
||||
git add inbox/archive/ "domains/$DOMAIN/" >> "$LOG" 2>&1
|
||||
git commit -m "$AGENT: extract claims from $(basename "$SOURCE_FILE")
|
||||
|
||||
- Source: $SOURCE_FILE
|
||||
- Domain: $DOMAIN
|
||||
- Extracted by: headless extraction cron
|
||||
|
||||
Pentagon-Agent: $(echo "$AGENT" | sed 's/./\U&/') <HEADLESS>" >> "$LOG" 2>&1
|
||||
|
||||
# Push branch
|
||||
git push -u "$REPO_URL" "$BRANCH" --force >> "$LOG" 2>&1
|
||||
|
||||
# Open PR
|
||||
PR_TITLE="$AGENT: extract claims from $(basename "$SOURCE_FILE" .md)"
|
||||
PR_BODY="## Automated Extraction\n\nSource: \`$SOURCE_FILE\`\nDomain: $DOMAIN\nExtracted by: headless cron on VPS\n\nThis PR was created automatically by the extraction cron job. Claims were extracted using \`skills/extract.md\` process via Claude headless."
|
||||
|
||||
curl -s -X POST "http://localhost:3000/api/v1/repos/teleo/teleo-codex/pulls" \
|
||||
-H "Authorization: token $AGENT_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{
|
||||
\"title\": \"$PR_TITLE\",
|
||||
\"body\": \"$PR_BODY\",
|
||||
\"base\": \"main\",
|
||||
\"head\": \"$BRANCH\"
|
||||
}" >> "$LOG" 2>&1
|
||||
|
||||
log "PR opened for $SOURCE_FILE"
|
||||
|
||||
# Back to main for next source
|
||||
git checkout main >> "$LOG" 2>&1
|
||||
|
||||
# Brief pause between extractions
|
||||
sleep 5
|
||||
done
|
||||
|
||||
log "Extraction run complete: processed $COUNT source(s)"
|
||||
Loading…
Reference in a new issue