Merge pull request #34 from living-ip/codex/teleo-agent-autorecover-20260702
Some checks are pending
CI / lint-and-test (push) Waiting to run
Some checks are pending
CI / lint-and-test (push) Waiting to run
Add Telegram agent autorecovery timer
This commit is contained in:
commit
66ecbf316e
7 changed files with 224 additions and 0 deletions
|
|
@ -16,6 +16,7 @@ PIPELINE_DIR="/opt/teleo-eval/pipeline"
|
|||
TELEGRAM_DIR="/opt/teleo-eval/telegram"
|
||||
DIAGNOSTICS_DIR="/opt/teleo-eval/diagnostics"
|
||||
AGENT_STATE_DIR="/opt/teleo-eval/ops/agent-state"
|
||||
SYSTEMD_DIR="/etc/systemd/system"
|
||||
STAMP_FILE="/opt/teleo-eval/.last-deploy-sha"
|
||||
LOG_TAG="auto-deploy"
|
||||
|
||||
|
|
@ -99,6 +100,21 @@ find /opt/teleo-eval -maxdepth 3 -name '*.sh' -not -perm -u+x -exec chmod +x {}
|
|||
|
||||
log "Files synced"
|
||||
|
||||
if [ "$OLD_SHA" = "none" ] || git diff --name-only "$OLD_SHA" "$NEW_SHA" -- systemd/ 2>/dev/null | grep -q .; then
|
||||
log "Installing systemd units"
|
||||
for unit in systemd/*.service systemd/*.timer; do
|
||||
[ -f "$unit" ] || continue
|
||||
sudo install -m 0644 "$unit" "$SYSTEMD_DIR/$(basename "$unit")"
|
||||
done
|
||||
sudo systemctl daemon-reload
|
||||
if [ -f systemd/teleo-auto-deploy.timer ]; then
|
||||
sudo systemctl enable --now teleo-auto-deploy.timer >/dev/null
|
||||
fi
|
||||
if [ -f systemd/teleo-agent-healthcheck.timer ]; then
|
||||
sudo systemctl enable --now teleo-agent-healthcheck.timer >/dev/null
|
||||
fi
|
||||
fi
|
||||
|
||||
# Restart services only if Python files changed
|
||||
RESTART=""
|
||||
add_restart() {
|
||||
|
|
@ -125,6 +141,10 @@ if [ "$OLD_SHA" != "none" ]; then
|
|||
add_restart_if_unit_active teleo-agent@leo
|
||||
add_restart_if_unit_exists teleo-agent@leo-wallet-test
|
||||
fi
|
||||
if git diff --name-only "$OLD_SHA" "$NEW_SHA" -- systemd/teleo-agent@.service 2>/dev/null | grep -q .; then
|
||||
add_restart_if_unit_active teleo-agent@leo
|
||||
add_restart_if_unit_exists teleo-agent@leo-wallet-test
|
||||
fi
|
||||
if git diff --name-only "$OLD_SHA" "$NEW_SHA" -- diagnostics/ 2>/dev/null | grep -q '\.py$'; then
|
||||
add_restart teleo-diagnostics
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ VPS_PIPELINE="/opt/teleo-eval/pipeline"
|
|||
VPS_TELEGRAM="/opt/teleo-eval/telegram"
|
||||
VPS_DIAGNOSTICS="/opt/teleo-eval/diagnostics"
|
||||
VPS_AGENT_STATE="/opt/teleo-eval/ops/agent-state"
|
||||
VPS_SYSTEMD="/etc/systemd/system"
|
||||
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
||||
|
||||
DRY_RUN=false
|
||||
|
|
@ -94,6 +95,14 @@ echo "=== Research session ==="
|
|||
rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/research/research-session.sh" "$VPS_HOST:/opt/teleo-eval/research-session.sh"
|
||||
echo ""
|
||||
|
||||
echo "=== Systemd units ==="
|
||||
# Ship the systemd unit files to the VPS and (re)enable the timers.
# NOTE(review): the dry-run branch presumably relies on RSYNC_OPTS carrying
# rsync's --dry-run flag; confirm where RSYNC_OPTS is built.
if $DRY_RUN; then
    rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/systemd/" "$VPS_HOST:/tmp/teleo-systemd-dry-run/"
else
    # The remote steps are chained with && so a failed unpack or install stops
    # the sequence instead of reloading and enabling timers against stale
    # units; the EXIT trap still removes the temp dir on every outcome.
    tar -C "$REPO_ROOT/systemd" -cf - . | ssh "$VPS_HOST" "tmpdir=\$(mktemp -d) && trap 'rm -rf \"\$tmpdir\"' EXIT && tar -C \"\$tmpdir\" -xf - && sudo install -m 0644 \"\$tmpdir\"/*.service \"\$tmpdir\"/*.timer '$VPS_SYSTEMD'/ && sudo systemctl daemon-reload && sudo systemctl enable --now teleo-auto-deploy.timer teleo-agent-healthcheck.timer >/dev/null"
fi
|
||||
echo ""
|
||||
|
||||
if $DRY_RUN; then
|
||||
echo "Dry run complete. No changes made."
|
||||
exit 0
|
||||
|
|
|
|||
10
systemd/teleo-agent-healthcheck.service
Normal file
10
systemd/teleo-agent-healthcheck.service
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
[Unit]
|
||||
Description=Teleo Telegram agent autorecovery check
|
||||
After=network.target
|
||||
Wants=network.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/opt/teleo-eval/pipeline/.venv/bin/python3 /opt/teleo-eval/telegram/agent_healthcheck.py --agents leo leo-wallet-test --since "20 min ago"
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
11
systemd/teleo-agent-healthcheck.timer
Normal file
11
systemd/teleo-agent-healthcheck.timer
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
[Unit]
Description=Run Teleo Telegram agent autorecovery check

[Timer]
# First run two minutes after boot, then five minutes after each activation
# of the healthcheck service.
OnBootSec=2min
OnUnitActiveSec=5min
AccuracySec=30s
# Persistent= is deliberately absent: systemd only honours it for OnCalendar=
# timers, so it would be a no-op on this purely monotonic schedule.

[Install]
WantedBy=timers.target
|
||||
114
telegram/agent_healthcheck.py
Normal file
114
telegram/agent_healthcheck.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Recover Teleo Telegram agent services from known runtime faults."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
# Agents checked when --agents is not given on the command line.
DEFAULT_AGENTS = ("leo", "leo-wallet-test")

# Directory that must appear in each agent unit's ReadWritePaths.
TRANSCRIPT_PATH = "/opt/teleo-eval/transcripts"

# Journal substrings that trigger a restart: the same read-only error, once
# for each quote character the path may be wrapped in.
RECOVERABLE_LOG_PATTERNS = tuple(
    f"Read-only file system: {quote}{TRANSCRIPT_PATH}" for quote in ("'", '"')
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class CommandResult:
    """Immutable record of a finished subprocess: exit status and captured text."""

    # Exit status reported by the process.
    returncode: int
    # Captured standard output.
    stdout: str
    # Captured standard error.
    stderr: str
|
||||
|
||||
|
||||
def run_command(args: list[str]) -> CommandResult:
    """Run *args* as a subprocess and return its exit status and text output.

    A non-zero exit status is reported through ``returncode`` rather than
    raised, so callers decide what a failure means.
    """
    proc = subprocess.run(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=False,
    )
    return CommandResult(proc.returncode, proc.stdout, proc.stderr)
|
||||
|
||||
|
||||
def unit_name(agent: str) -> str:
    """Return the ``teleo-agent@`` template instance unit name for *agent*."""
    return "teleo-agent@{}.service".format(agent)
|
||||
|
||||
|
||||
def service_is_active(unit: str) -> bool:
    """Return True when ``systemctl is-active --quiet`` exits with 0 for *unit*."""
    status = run_command(["systemctl", "is-active", "--quiet", unit])
    return status.returncode == 0
|
||||
|
||||
|
||||
def _grants_write_to(entry: str, target: str) -> bool:
    """Return True when ReadWritePaths *entry* is *target* or one of its ancestors."""
    from pathlib import PurePosixPath

    # systemd allows a "-" (ignore if missing) and/or "+" (relative to the
    # unit's root directory) prefix in front of the path itself.
    path = entry.lstrip("-+")
    if not path.startswith("/"):
        return False
    wanted = PurePosixPath(target)
    granted = PurePosixPath(path)  # PurePosixPath also drops a trailing slash
    return granted == wanted or granted in wanted.parents


def readwrite_paths_include_transcripts(unit: str) -> bool:
    """Report whether *unit*'s ReadWritePaths make the transcript directory writable.

    The value printed by ``systemctl show -p ReadWritePaths --value`` is split
    on whitespace. An entry counts when, after removing systemd's ``-``/``+``
    prefixes, it is ``TRANSCRIPT_PATH`` itself (with or without a trailing
    slash) or a parent directory of it.

    Returns:
        False when ``systemctl show`` exits non-zero or no entry qualifies.
    """
    result = run_command(["systemctl", "show", unit, "-p", "ReadWritePaths", "--value"])
    if result.returncode != 0:
        return False
    return any(_grants_write_to(entry, TRANSCRIPT_PATH) for entry in result.stdout.split())
|
||||
|
||||
|
||||
def recent_logs(unit: str, since: str) -> str:
    """Return journal text logged by the current run of *unit* since *since*.

    Args:
        unit: systemd unit whose journal is read.
        since: value handed to ``journalctl --since``.

    Returns:
        journalctl's stdout; when journalctl exits non-zero, its stdout
        followed by its stderr.
    """
    command = ["journalctl", "-u", unit, "--since", since, "--no-pager"]

    # Only look at the unit's current invocation. Without this, a fault logged
    # before the last (re)start stays inside the --since window and triggers
    # another restart on every check until it ages out.
    shown = run_command(["systemctl", "show", unit, "-p", "InvocationID", "--value"])
    invocation_id = shown.stdout.strip().lower() if shown.returncode == 0 else ""
    # An invocation ID is 32 hex digits; anything else (empty for a unit that
    # is not running, or no such property) falls back to the unfiltered query.
    if len(invocation_id) == 32 and all(ch in "0123456789abcdef" for ch in invocation_id):
        command.append(f"_SYSTEMD_INVOCATION_ID={invocation_id}")

    result = run_command(command)
    if result.returncode != 0:
        return result.stdout + result.stderr
    return result.stdout
|
||||
|
||||
|
||||
def should_restart_from_logs(log_text: str) -> bool:
    """Report whether *log_text* contains any entry of RECOVERABLE_LOG_PATTERNS."""
    for marker in RECOVERABLE_LOG_PATTERNS:
        if marker in log_text:
            return True
    return False
|
||||
|
||||
|
||||
def restart_service(unit: str, *, dry_run: bool) -> bool:
    """Restart *unit* through ``systemctl restart`` and report success.

    With ``dry_run`` set, nothing is executed: the intended action is printed
    and True is returned. A failed restart is reported on stderr together with
    systemctl's own stderr text.
    """
    if not dry_run:
        outcome = run_command(["systemctl", "restart", unit])
        if outcome.returncode == 0:
            return True
        print(f"restart_failed unit={unit} stderr={outcome.stderr.strip()}", file=sys.stderr)
        return False

    print(f"dry_run restart {unit}")
    return True
|
||||
|
||||
|
||||
def check_agent(agent: str, *, since: str, dry_run: bool) -> bool:
    """Check one agent's unit and restart it when it is down or faulted.

    A restart is attempted when the unit is inactive, or when it is active but
    its recent journal contains a recoverable fault marker. A unit whose
    ReadWritePaths lack the transcript directory is reported on stderr and
    makes the result False, but is still restarted if one of those conditions
    holds.

    Returns:
        True when the unit has the transcript write path and is either healthy
        or (outside dry-run) active again after a successful restart.
    """
    unit = unit_name(agent)

    writable = readwrite_paths_include_transcripts(unit)
    if not writable:
        print(f"unit_missing_transcript_write_path unit={unit}", file=sys.stderr)

    if not service_is_active(unit):
        reason = "inactive"
    elif should_restart_from_logs(recent_logs(unit, since)):
        reason = "recoverable_log_fault"
    else:
        print(f"healthy unit={unit}")
        return writable

    print(f"recovering unit={unit} reasons={reason}")
    restarted = restart_service(unit, dry_run=dry_run)
    if not (restarted and writable):
        return False
    if dry_run:
        return True

    # Confirm the restart actually brought the unit back up.
    active = service_is_active(unit)
    print(f"post_restart unit={unit} active={active}")
    return active
|
||||
|
||||
|
||||
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the healthcheck command line given in *argv*.

    Recognised options: ``--agents`` (one or more names, defaulting to
    DEFAULT_AGENTS), ``--since`` (defaulting to "20 min ago") and the
    ``--dry-run`` flag.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    option_table = (
        ("--agents", {"nargs": "+", "default": list(DEFAULT_AGENTS)}),
        ("--since", {"default": "20 min ago"}),
        ("--dry-run", {"action": "store_true"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Check every requested agent and return a process exit status.

    Args:
        argv: command-line arguments; ``sys.argv[1:]`` is used when None.

    Returns:
        0 when every agent check succeeded, 1 otherwise.
    """
    if argv is None:
        argv = sys.argv[1:]
    options = parse_args(argv)

    # Collect all outcomes first so every agent is checked even after a failure.
    outcomes = [
        check_agent(name, since=options.since, dry_run=options.dry_run)
        for name in options.agents
    ]
    return 0 if all(outcomes) else 1
|
||||
|
||||
|
||||
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
35
tests/test_teleo_agent_healthcheck.py
Normal file
35
tests/test_teleo_agent_healthcheck.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(REPO_ROOT / "telegram"))
|
||||
|
||||
import agent_healthcheck # noqa: E402
|
||||
|
||||
|
||||
def test_unit_name_uses_teleo_agent_template():
    """Each agent name maps onto its ``teleo-agent@<agent>.service`` instance."""
    expected_units = {
        "leo": "teleo-agent@leo.service",
        "leo-wallet-test": "teleo-agent@leo-wallet-test.service",
    }
    for agent, unit in expected_units.items():
        assert agent_healthcheck.unit_name(agent) == unit
|
||||
|
||||
|
||||
def test_should_restart_from_transcript_read_only_fault():
    """A single-quoted read-only transcript path in the log triggers a restart."""
    fault_line = "OSError: [Errno 30] Read-only file system: '/opt/teleo-eval/transcripts/leo'"
    restart_needed = agent_healthcheck.should_restart_from_logs(fault_line)

    assert restart_needed
|
||||
|
||||
|
||||
def test_should_restart_from_double_quoted_transcript_read_only_fault():
    """A double-quoted read-only transcript path in the log triggers a restart."""
    fault_line = 'OSError: [Errno 30] Read-only file system: "/opt/teleo-eval/transcripts/leo"'
    restart_needed = agent_healthcheck.should_restart_from_logs(fault_line)

    assert restart_needed
|
||||
|
||||
|
||||
def test_should_not_restart_from_generic_application_log():
    """Ordinary startup log lines must not be treated as a recoverable fault."""
    quiet_log = "\n".join(
        [
            "INFO:root:Application started",
            "INFO:root:Bot running as @livingipleobot",
        ]
    )

    assert not agent_healthcheck.should_restart_from_logs(quiet_log)
|
||||
|
||||
|
||||
def test_default_agents_are_live_leo_and_wallet_test():
    """The default agent list is exactly the leo and leo-wallet-test agents."""
    expected_agents = ("leo", "leo-wallet-test")

    assert agent_healthcheck.DEFAULT_AGENTS == expected_agents
|
||||
|
|
@ -45,3 +45,28 @@ def test_auto_deploy_prefers_github_remote_when_present():
|
|||
assert 'git fetch "$DEPLOY_REMOTE" main' in auto_deploy
|
||||
assert 'git rev-parse "$DEPLOY_REMOTE/main"' in auto_deploy
|
||||
assert 'git merge --ff-only "$DEPLOY_REMOTE/main"' in auto_deploy
|
||||
|
||||
|
||||
def test_agent_healthcheck_timer_runs_both_live_telegram_agents():
    """The healthcheck service and timer units carry the expected settings."""
    unit_dir = REPO_ROOT / "systemd"
    service = (unit_dir / "teleo-agent-healthcheck.service").read_text()
    timer = (unit_dir / "teleo-agent-healthcheck.timer").read_text()

    service_snippets = (
        "/opt/teleo-eval/telegram/agent_healthcheck.py",
        "--agents leo leo-wallet-test",
        '--since "20 min ago"',
    )
    for snippet in service_snippets:
        assert snippet in service

    for snippet in ("OnUnitActiveSec=5min", "WantedBy=timers.target"):
        assert snippet in timer
|
||||
|
||||
|
||||
def test_deploy_scripts_install_systemd_units_and_enable_agent_healthcheck():
    """Both deploy scripts install the systemd units and enable the healthcheck timer."""
    deploy_dir = REPO_ROOT / "deploy"
    auto_deploy = (deploy_dir / "auto-deploy.sh").read_text()
    manual_deploy = (deploy_dir / "deploy.sh").read_text()

    auto_snippets = (
        'SYSTEMD_DIR="/etc/systemd/system"',
        'sudo install -m 0644 "$unit" "$SYSTEMD_DIR/$(basename "$unit")"',
        "sudo systemctl daemon-reload",
        "sudo systemctl enable --now teleo-agent-healthcheck.timer",
        'git diff --name-only "$OLD_SHA" "$NEW_SHA" -- systemd/teleo-agent@.service',
    )
    for snippet in auto_snippets:
        assert snippet in auto_deploy

    manual_snippets = (
        'VPS_SYSTEMD="/etc/systemd/system"',
        "teleo-agent-healthcheck.timer",
    )
    for snippet in manual_snippets:
        assert snippet in manual_deploy
|
||||
|
|
|
|||
Loading…
Reference in a new issue