- watchdog.py: tier0 auto-recovery (3 retries, 1h cooldown, audit trail) — pending Ganymede review
- stale_pr.py: new module, closes extraction PRs open >30 min with zero claims
- deploy.sh: expanded with new deployment features
- validate.py, extract.py, cascade.py, db.py: minor fixes
- backfill-descriptions.py: utility script
- review_queue.py: minor fix

Note: watchdog + stale_pr not yet deployed to VPS (reverted after missing import crash)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
207 lines
6.1 KiB
Bash
Executable file
207 lines
6.1 KiB
Bash
Executable file
#!/usr/bin/env bash
# deploy.sh — Deploy pipeline and diagnostics to VPS from repo
# Usage: ./deploy.sh [--dry-run] [--restart]
#
# Requires: committed, clean working tree. Enforces repo-first workflow.
set -euo pipefail

# SSH login used for every rsync/ssh call, and the target directories on it.
readonly VPS_HOST="teleo@77.42.65.182"
readonly VPS_PIPELINE="/opt/teleo-eval/pipeline"
readonly VPS_DIAGNOSTICS="/opt/teleo-eval/diagnostics"
readonly VPS_AGENT_STATE="/opt/teleo-eval/ops/agent-state"

# Repository root: the parent of the directory that holds this script.
# CDPATH is cleared for the lookup because a CDPATH hit makes `cd` print the
# directory it picked, and that text would be captured together with `pwd`.
# The assignment is kept separate from `readonly` so a failing `cd` is not
# masked and still aborts the script under `set -e`.
REPO_ROOT="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
readonly REPO_ROOT

# Command-line switches; both default to off.
DRY_RUN=false
RESTART=false

#######################################
# Interpret the command-line switches.
# Globals:   DRY_RUN, RESTART (written)
# Arguments: the script's arguments, unchanged
# Outputs:   usage text on stdout for --help/-h; an error on stderr for an
#            unrecognised argument
# Returns:   does not return for --help/-h (exit 0) or an unrecognised
#            argument (exit 1)
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --dry-run) DRY_RUN=true ;;
      --restart) RESTART=true ;;
      --help|-h)
        echo "Usage: $0 [--dry-run] [--restart]"
        echo " --dry-run Show what would be deployed without doing it"
        echo " --restart Restart services after deploy"
        exit 0
        ;;
      *)
        # Diagnostics belong on stderr so they are not mixed into stdout.
        echo "Unknown arg: $arg" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# Gate: working tree must be clean
# `git status` runs as its own command so that a git failure (for example
# REPO_ROOT not being a repository) stops the deploy. Inside `[ -n "$(...)" ]`
# a failing git only yields an empty string, which would read as "clean".
if ! dirty_listing="$(git -C "$REPO_ROOT" status --porcelain)"; then
  echo "ERROR: Could not read git status for $REPO_ROOT." >&2
  exit 1
fi
if [ -n "$dirty_listing" ]; then
  echo "ERROR: Uncommitted changes. Commit first, deploy second." >&2
  git -C "$REPO_ROOT" status --short >&2
  exit 1
fi

# Looked up separately for the same reason: a failure inside an `echo "$(...)"`
# argument would go unnoticed.
if ! head_commit="$(git -C "$REPO_ROOT" log --oneline -1)"; then
  echo "ERROR: Could not determine the commit being deployed." >&2
  exit 1
fi
echo "Deploying from commit: $head_commit"
echo ""

# Syntax check all Python files before deploying

#######################################
# Parse one Python file with the `ast` module.
# Arguments: $1 - path to the file
# Outputs:   Python's own error report on stderr when the file does not parse
# Returns:   0 if the file parses, non-zero otherwise
#######################################
python_syntax_ok() {
  # The file is read as bytes so Python applies its source-encoding rules
  # itself instead of decoding with the caller's locale, and the path is
  # passed as `filename` so the error report names the file.
  python3 -c 'import ast, sys; ast.parse(open(sys.argv[1], "rb").read(), filename=sys.argv[1])' "$1"
}

echo "=== Pre-deploy syntax check ==="
ERRORS=0
for f in "$REPO_ROOT/ops/pipeline-v2/lib/"*.py "$REPO_ROOT/ops/pipeline-v2/"*.py "$REPO_ROOT/ops/diagnostics/"*.py; do
  # An unmatched glob stays as a literal pattern; skip anything that is not a file.
  [ -f "$f" ] || continue
  # stderr is left visible on purpose: it says where the error is.
  if ! python_syntax_ok "$f"; then
    echo "SYNTAX ERROR: $f" >&2
    ERRORS=$((ERRORS + 1))
  fi
done
if [ "$ERRORS" -gt 0 ]; then
  echo "ERROR: $ERRORS files have syntax errors. Fix before deploying." >&2
  exit 1
fi
echo "All files pass syntax check."
echo ""

# ---------------------------------------------------------------------------
# Deploy: rsync repo content to the VPS, then (with --restart) restart and
# smoke-test only the services whose files changed.
#
# Reads: REPO_ROOT, VPS_HOST, VPS_PIPELINE, VPS_DIAGNOSTICS, VPS_AGENT_STATE,
#        DRY_RUN, RESTART.
# ---------------------------------------------------------------------------

# Options are held in an array so each --exclude pattern reaches rsync as one
# quote-free argument. In a flat string expanded unquoted, the inner single
# quotes are handed to rsync literally and the patterns never match.
RSYNC_FLAGS=(-avz --exclude='__pycache__' --exclude='*.pyc' --exclude='*.bak*')

PIPELINE_CHANGED=false
DIAG_CHANGED=false

# Decide which services are affected BEFORE syncing. Once the real sync has
# run, a dry-run comparison has nothing left to report, so a check made
# afterwards would never request a restart.
#   pipeline-v2/lib/, pipeline-v2/*.py → teleo-pipeline
#   diagnostics/                       → teleo-diagnostics
#   agent-state/, research-session.sh  → no restart (not daemons)
if $RESTART && ! $DRY_RUN; then
  # Output is captured first and searched afterwards: piping rsync straight
  # into `grep -q` lets grep exit early, and the resulting SIGPIPE on rsync
  # is turned into "no match" by `pipefail`. A failing rsync aborts here.
  pending="$(rsync "${RSYNC_FLAGS[@]}" --dry-run \
    "$REPO_ROOT/ops/pipeline-v2/lib/" "$VPS_HOST:$VPS_PIPELINE/lib/")"
  if grep -q '\.py$' <<<"$pending"; then
    PIPELINE_CHANGED=true
  fi

  for f in teleo-pipeline.py reweave.py; do
    [ -f "$REPO_ROOT/ops/pipeline-v2/$f" ] || continue
    pending="$(rsync "${RSYNC_FLAGS[@]}" --dry-run \
      "$REPO_ROOT/ops/pipeline-v2/$f" "$VPS_HOST:$VPS_PIPELINE/$f")"
    # -F: the file name is matched literally, not as a regular expression.
    if grep -qF -- "$f" <<<"$pending"; then
      PIPELINE_CHANGED=true
    fi
  done

  pending="$(rsync "${RSYNC_FLAGS[@]}" --dry-run \
    "$REPO_ROOT/ops/diagnostics/" "$VPS_HOST:$VPS_DIAGNOSTICS/")"
  if grep -q '\.py$' <<<"$pending"; then
    DIAG_CHANGED=true
  fi
fi

if $DRY_RUN; then
  RSYNC_FLAGS+=(--dry-run)
  echo "=== DRY RUN ==="
fi

echo "=== Pipeline lib/ ==="
rsync "${RSYNC_FLAGS[@]}" "$REPO_ROOT/ops/pipeline-v2/lib/" "$VPS_HOST:$VPS_PIPELINE/lib/"
echo ""

echo "=== Pipeline top-level ==="
for f in teleo-pipeline.py reweave.py batch-extract-50.sh; do
  # Files missing from the repo are skipped rather than treated as an error.
  [ -f "$REPO_ROOT/ops/pipeline-v2/$f" ] || continue
  rsync "${RSYNC_FLAGS[@]}" "$REPO_ROOT/ops/pipeline-v2/$f" "$VPS_HOST:$VPS_PIPELINE/$f"
done
echo ""

echo "=== Diagnostics ==="
rsync "${RSYNC_FLAGS[@]}" "$REPO_ROOT/ops/diagnostics/" "$VPS_HOST:$VPS_DIAGNOSTICS/"
echo ""

echo "=== Agent state ==="
rsync "${RSYNC_FLAGS[@]}" "$REPO_ROOT/ops/agent-state/" "$VPS_HOST:$VPS_AGENT_STATE/"
echo ""

echo "=== Research session ==="
rsync "${RSYNC_FLAGS[@]}" "$REPO_ROOT/ops/research-session.sh" "$VPS_HOST:/opt/teleo-eval/research-session.sh"
echo ""

if $DRY_RUN; then
  echo "Dry run complete. No changes made."
  exit 0
fi

echo "Deploy complete."

if $RESTART; then
  echo ""
  echo "=== Detecting services to restart ==="

  # Report the decisions taken from the pre-sync comparison above.
  RESTART_SVCS=()

  if $PIPELINE_CHANGED; then
    RESTART_SVCS+=(teleo-pipeline)
    echo " teleo-pipeline: files changed, will restart"
  else
    echo " teleo-pipeline: no changes, skipping"
  fi

  if $DIAG_CHANGED; then
    RESTART_SVCS+=(teleo-diagnostics)
    echo " teleo-diagnostics: files changed, will restart"
  else
    echo " teleo-diagnostics: no changes, skipping"
  fi

  if [ "${#RESTART_SVCS[@]}" -eq 0 ]; then
    echo ""
    echo "No service files changed. Skipping restart."
  else
    echo ""
    echo "=== Restarting: ${RESTART_SVCS[*]} ==="
    ssh "$VPS_HOST" "sudo systemctl restart ${RESTART_SVCS[*]}"
    echo "Services restarted. Waiting 5s for startup..."
    sleep 5

    echo ""
    echo "=== Smoke test ==="
    SMOKE_FAIL=0

    # Check systemd unit status for restarted services
    for svc in "${RESTART_SVCS[@]}"; do
      if ssh "$VPS_HOST" "systemctl is-active --quiet $svc"; then
        echo " $svc: active"
      else
        echo " $svc: FAILED"
        # Best effort: failing to fetch the log must not hide the failure itself.
        ssh "$VPS_HOST" "journalctl -u $svc -n 10 --no-pager" || true
        SMOKE_FAIL=1
      fi
    done

    # Hit health endpoints for restarted services
    if $PIPELINE_CHANGED; then
      if ssh "$VPS_HOST" "curl -sf --connect-timeout 3 http://localhost:8080/health > /dev/null"; then
        echo " pipeline health (8080): OK"
      else
        echo " pipeline health (8080): FAILED"
        SMOKE_FAIL=1
      fi
    fi

    if $DIAG_CHANGED; then
      if ssh "$VPS_HOST" "curl -sf --connect-timeout 3 http://localhost:8081/ops > /dev/null"; then
        echo " diagnostics (8081): OK"
      else
        echo " diagnostics (8081): FAILED"
        SMOKE_FAIL=1
      fi
    fi

    # Tail logs for quick visual check
    echo ""
    echo "=== Recent logs (10s) ==="
    JOURNAL_UNITS=""
    for svc in "${RESTART_SVCS[@]}"; do
      JOURNAL_UNITS="$JOURNAL_UNITS -u $svc"
    done
    # Best effort: the log tail is informational only.
    ssh "$VPS_HOST" "journalctl $JOURNAL_UNITS --since '-10s' --no-pager -n 20" || true

    if [ "$SMOKE_FAIL" -gt 0 ]; then
      echo ""
      echo "WARNING: Smoke test detected failures. Check logs above." >&2
      exit 1
    fi

    echo ""
    echo "Smoke test passed."
  fi
fi