diff --git a/deploy/auto-deploy.sh b/deploy/auto-deploy.sh
index 86faf9b..cfb79a4 100755
--- a/deploy/auto-deploy.sh
+++ b/deploy/auto-deploy.sh
@@ -16,6 +16,7 @@ PIPELINE_DIR="/opt/teleo-eval/pipeline"
 TELEGRAM_DIR="/opt/teleo-eval/telegram"
 DIAGNOSTICS_DIR="/opt/teleo-eval/diagnostics"
 AGENT_STATE_DIR="/opt/teleo-eval/ops/agent-state"
+SYSTEMD_DIR="/etc/systemd/system"
 STAMP_FILE="/opt/teleo-eval/.last-deploy-sha"
 LOG_TAG="auto-deploy"
 
@@ -99,6 +100,22 @@ find /opt/teleo-eval -maxdepth 3 -name '*.sh' -not -perm -u+x -exec chmod +x {}
 
 log "Files synced"
 
+# Reinstall unit files on the first deploy or whenever systemd/ changed in this update
+if [ "$OLD_SHA" = "none" ] || git diff --name-only "$OLD_SHA" "$NEW_SHA" -- systemd/ 2>/dev/null | grep -q .; then
+    log "Installing systemd units"
+    for unit in systemd/*.service systemd/*.timer; do
+        [ -f "$unit" ] || continue
+        sudo install -m 0644 "$unit" "$SYSTEMD_DIR/$(basename "$unit")"
+    done
+    sudo systemctl daemon-reload
+    if [ -f systemd/teleo-auto-deploy.timer ]; then
+        sudo systemctl enable --now teleo-auto-deploy.timer >/dev/null
+    fi
+    if [ -f systemd/teleo-agent-healthcheck.timer ]; then
+        sudo systemctl enable --now teleo-agent-healthcheck.timer >/dev/null
+    fi
+fi
+
 # Restart services only if Python files changed
 RESTART=""
 add_restart() {
@@ -125,6 +142,11 @@ if [ "$OLD_SHA" != "none" ]; then
         add_restart_if_unit_active teleo-agent@leo
         add_restart_if_unit_exists teleo-agent@leo-wallet-test
     fi
+    # A changed unit template only takes effect once the agent instances restart
+    if git diff --name-only "$OLD_SHA" "$NEW_SHA" -- systemd/teleo-agent@.service 2>/dev/null | grep -q .; then
+        add_restart_if_unit_active teleo-agent@leo
+        add_restart_if_unit_exists teleo-agent@leo-wallet-test
+    fi
     if git diff --name-only "$OLD_SHA" "$NEW_SHA" -- diagnostics/ 2>/dev/null | grep -q '\.py$'; then
         add_restart teleo-diagnostics
     fi
diff --git a/deploy/deploy.sh b/deploy/deploy.sh
index 0607ecd..2a87728 100755
--- a/deploy/deploy.sh
+++ b/deploy/deploy.sh
@@ -10,6 +10,7 @@ VPS_PIPELINE="/opt/teleo-eval/pipeline"
 VPS_TELEGRAM="/opt/teleo-eval/telegram"
 VPS_DIAGNOSTICS="/opt/teleo-eval/diagnostics"
 VPS_AGENT_STATE="/opt/teleo-eval/ops/agent-state"
+VPS_SYSTEMD="/etc/systemd/system"
 REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
 DRY_RUN=false
 
@@ -94,6 +95,16 @@ echo "=== Research session ==="
 rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/research/research-session.sh" "$VPS_HOST:/opt/teleo-eval/research-session.sh"
 echo ""
 
+echo "=== Systemd units ==="
+if $DRY_RUN; then
+    rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/systemd/" "$VPS_HOST:/tmp/teleo-systemd-dry-run/"
+else
+    # Remote "set -e" aborts on the first failing step so a failed install is not
+    # masked by a later successful command; the EXIT trap removes the temp dir.
+    tar -C "$REPO_ROOT/systemd" -cf - . | ssh "$VPS_HOST" "set -e; tmpdir=\$(mktemp -d); trap 'rm -rf \"\$tmpdir\"' EXIT; tar -C \"\$tmpdir\" -xf -; sudo install -m 0644 \"\$tmpdir\"/*.service \"\$tmpdir\"/*.timer '$VPS_SYSTEMD'/; sudo systemctl daemon-reload; sudo systemctl enable --now teleo-auto-deploy.timer teleo-agent-healthcheck.timer >/dev/null"
+fi
+echo ""
+
 if $DRY_RUN; then
     echo "Dry run complete. No changes made."
     exit 0
diff --git a/systemd/teleo-agent-healthcheck.service b/systemd/teleo-agent-healthcheck.service
new file mode 100644
index 0000000..0f39c7a
--- /dev/null
+++ b/systemd/teleo-agent-healthcheck.service
@@ -0,0 +1,10 @@
+[Unit]
+Description=Teleo Telegram agent autorecovery check
+After=network.target
+Wants=network.target
+
+[Service]
+Type=oneshot
+ExecStart=/opt/teleo-eval/pipeline/.venv/bin/python3 /opt/teleo-eval/telegram/agent_healthcheck.py --agents leo leo-wallet-test --since "20 min ago"
+StandardOutput=journal
+StandardError=journal
diff --git a/systemd/teleo-agent-healthcheck.timer b/systemd/teleo-agent-healthcheck.timer
new file mode 100644
index 0000000..12e715f
--- /dev/null
+++ b/systemd/teleo-agent-healthcheck.timer
@@ -0,0 +1,11 @@
+[Unit]
+Description=Run Teleo Telegram agent autorecovery check
+
+[Timer]
+OnBootSec=2min
+OnUnitActiveSec=5min
+AccuracySec=30s
+Persistent=true
+
+[Install]
+WantedBy=timers.target
diff --git a/telegram/agent_healthcheck.py b/telegram/agent_healthcheck.py
new file mode 100644
index 0000000..46d914e
--- /dev/null
+++ b/telegram/agent_healthcheck.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""Recover Teleo Telegram agent services from known runtime faults."""
+
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+from dataclasses import dataclass
+
+
+DEFAULT_AGENTS = ("leo", "leo-wallet-test")
+TRANSCRIPT_PATH = "/opt/teleo-eval/transcripts"
+RECOVERABLE_LOG_PATTERNS = (
+    "Read-only file system: '/opt/teleo-eval/transcripts",
+    'Read-only file system: "/opt/teleo-eval/transcripts',
+)
+
+
+@dataclass(frozen=True)
+class CommandResult:
+    """Exit status and captured text output of a finished command."""
+
+    returncode: int
+    stdout: str
+    stderr: str
+
+
+def run_command(args: list[str]) -> CommandResult:
+    """Run *args* without a shell, capturing output; a non-zero exit is returned, not raised."""
+    completed = subprocess.run(args, capture_output=True, text=True, check=False)
+    return CommandResult(
+        returncode=completed.returncode,
+        stdout=completed.stdout,
+        stderr=completed.stderr,
+    )
+
+
+def unit_name(agent: str) -> str:
+    """Return the systemd unit name of the templated agent service for *agent*."""
+    return f"teleo-agent@{agent}.service"
+
+
+def service_is_active(unit: str) -> bool:
+    """Return True when ``systemctl is-active`` exits 0 for *unit*."""
+    return run_command(["systemctl", "is-active", "--quiet", unit]).returncode == 0
+
+
+def readwrite_paths_include_transcripts(unit: str) -> bool:
+    """Return True when the unit's ReadWritePaths property lists TRANSCRIPT_PATH."""
+    result = run_command(["systemctl", "show", unit, "-p", "ReadWritePaths", "--value"])
+    return result.returncode == 0 and TRANSCRIPT_PATH in result.stdout.split()
+
+
+def current_invocation_id(unit: str) -> str:
+    """Return the InvocationID of the unit's current run, or "" when it cannot be read."""
+    result = run_command(["systemctl", "show", unit, "-p", "InvocationID", "--value"])
+    if result.returncode != 0:
+        return ""
+    return result.stdout.strip()
+
+
+def recent_logs(unit: str, since: str) -> str:
+    """Return journal text newer than *since* from the unit's current invocation.
+
+    The healthcheck timer fires every 5 minutes but looks back 20 minutes, so a
+    fault line written before a restart would stay in the window and trigger a
+    fresh restart on every run until it aged out.  Adding a match on the
+    current InvocationID hides lines logged by earlier runs of the unit.  When
+    no ID can be read, the whole window is returned.
+
+    On journalctl failure, stdout and stderr are returned concatenated.
+    """
+    args = ["journalctl", "-u", unit, "--since", since, "--no-pager"]
+    invocation = current_invocation_id(unit)
+    if invocation:
+        args.append(f"_SYSTEMD_INVOCATION_ID={invocation}")
+    result = run_command(args)
+    if result.returncode != 0:
+        return result.stdout + result.stderr
+    return result.stdout
+
+
+def should_restart_from_logs(log_text: str) -> bool:
+    """Return True when *log_text* contains any recoverable fault pattern."""
+    return any(pattern in log_text for pattern in RECOVERABLE_LOG_PATTERNS)
+
+
+def restart_service(unit: str, *, dry_run: bool) -> bool:
+    """Restart *unit* (only announce it when *dry_run*); return True on success."""
+    if dry_run:
+        print(f"dry_run restart {unit}")
+        return True
+    result = run_command(["systemctl", "restart", unit])
+    if result.returncode != 0:
+        print(f"restart_failed unit={unit} stderr={result.stderr.strip()}", file=sys.stderr)
+        return False
+    return True
+
+
+def check_agent(agent: str, *, since: str, dry_run: bool) -> bool:
+    """Check one agent's unit, restart it when inactive or faulted, and return its health."""
+    unit = unit_name(agent)
+    ok = True
+    reasons: list[str] = []
+
+    if not readwrite_paths_include_transcripts(unit):
+        print(f"unit_missing_transcript_write_path unit={unit}", file=sys.stderr)
+        ok = False
+
+    if not service_is_active(unit):
+        reasons.append("inactive")
+    elif should_restart_from_logs(recent_logs(unit, since)):
+        reasons.append("recoverable_log_fault")
+
+    if reasons:
+        print(f"recovering unit={unit} reasons={','.join(reasons)}")
+        ok = restart_service(unit, dry_run=dry_run) and ok
+        if ok and not dry_run:
+            ok = service_is_active(unit)
+            print(f"post_restart unit={unit} active={ok}")
+    else:
+        print(f"healthy unit={unit}")
+
+    return ok
+
+
+def parse_args(argv: list[str]) -> argparse.Namespace:
+    """Parse the command-line options (--agents, --since, --dry-run)."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--agents", nargs="+", default=list(DEFAULT_AGENTS))
+    parser.add_argument("--since", default="20 min ago")
+    parser.add_argument("--dry-run", action="store_true")
+    return parser.parse_args(argv)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Check every requested agent; return 0 only when all of them are healthy."""
+    args = parse_args(sys.argv[1:] if argv is None else argv)
+    ok = True
+    for agent in args.agents:
+        ok = check_agent(agent, since=args.since, dry_run=args.dry_run) and ok
+    return 0 if ok else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test_teleo_agent_healthcheck.py b/tests/test_teleo_agent_healthcheck.py
new file mode 100644
index 0000000..e491196
--- /dev/null
+++ b/tests/test_teleo_agent_healthcheck.py
@@ -0,0 +1,50 @@
+from pathlib import Path
+import sys
+
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(REPO_ROOT / "telegram"))
+
+import agent_healthcheck  # noqa: E402
+
+
+def test_unit_name_uses_teleo_agent_template():
+    assert agent_healthcheck.unit_name("leo") == "teleo-agent@leo.service"
+    assert agent_healthcheck.unit_name("leo-wallet-test") == "teleo-agent@leo-wallet-test.service"
+
+
+def test_should_restart_from_transcript_read_only_fault():
+    log_text = "OSError: [Errno 30] Read-only file system: '/opt/teleo-eval/transcripts/leo'"
+
+    assert agent_healthcheck.should_restart_from_logs(log_text)
+
+
+def test_should_restart_from_double_quoted_transcript_read_only_fault():
+    log_text = 'OSError: [Errno 30] Read-only file system: "/opt/teleo-eval/transcripts/leo"'
+
+    assert agent_healthcheck.should_restart_from_logs(log_text)
+
+
+def test_should_not_restart_from_generic_application_log():
+    log_text = "INFO:root:Application started\nINFO:root:Bot running as @livingipleobot"
+
+    assert not agent_healthcheck.should_restart_from_logs(log_text)
+
+
+def test_default_agents_are_live_leo_and_wallet_test():
+    assert agent_healthcheck.DEFAULT_AGENTS == ("leo", "leo-wallet-test")
+
+
+def test_recent_logs_only_reads_current_invocation(monkeypatch):
+    calls = []
+
+    def fake_run_command(args):
+        calls.append(args)
+        if args[0] == "systemctl":
+            return agent_healthcheck.CommandResult(0, "abc123\n", "")
+        return agent_healthcheck.CommandResult(0, "log line\n", "")
+
+    monkeypatch.setattr(agent_healthcheck, "run_command", fake_run_command)
+
+    assert agent_healthcheck.recent_logs("teleo-agent@leo.service", "20 min ago") == "log line\n"
+    assert "_SYSTEMD_INVOCATION_ID=abc123" in calls[-1]
diff --git a/tests/test_teleo_agent_systemd.py b/tests/test_teleo_agent_systemd.py
index c7295bc..d64dd63 100644
--- a/tests/test_teleo_agent_systemd.py
+++ b/tests/test_teleo_agent_systemd.py
@@ -45,3 +45,28 @@ def test_auto_deploy_prefers_github_remote_when_present():
     assert 'git fetch "$DEPLOY_REMOTE" main' in auto_deploy
     assert 'git rev-parse "$DEPLOY_REMOTE/main"' in auto_deploy
     assert 'git merge --ff-only "$DEPLOY_REMOTE/main"' in auto_deploy
+
+
+def test_agent_healthcheck_timer_runs_both_live_telegram_agents():
+    service = (REPO_ROOT / "systemd" / "teleo-agent-healthcheck.service").read_text()
+    timer = (REPO_ROOT / "systemd" / "teleo-agent-healthcheck.timer").read_text()
+
+    assert "/opt/teleo-eval/telegram/agent_healthcheck.py" in service
+    assert "--agents leo leo-wallet-test" in service
+    assert "--since \"20 min ago\"" in service
+    assert "OnUnitActiveSec=5min" in timer
+    assert "WantedBy=timers.target" in timer
+
+
+def test_deploy_scripts_install_systemd_units_and_enable_agent_healthcheck():
+    auto_deploy = (REPO_ROOT / "deploy" / "auto-deploy.sh").read_text()
+    manual_deploy = (REPO_ROOT / "deploy" / "deploy.sh").read_text()
+
+    assert 'SYSTEMD_DIR="/etc/systemd/system"' in auto_deploy
+    assert 'sudo install -m 0644 "$unit" "$SYSTEMD_DIR/$(basename "$unit")"' in auto_deploy
+    assert "sudo systemctl daemon-reload" in auto_deploy
+    assert "sudo systemctl enable --now teleo-agent-healthcheck.timer" in auto_deploy
+    assert "git diff --name-only \"$OLD_SHA\" \"$NEW_SHA\" -- systemd/teleo-agent@.service" in auto_deploy
+
+    assert 'VPS_SYSTEMD="/etc/systemd/system"' in manual_deploy
+    assert "teleo-agent-healthcheck.timer" in manual_deploy