Merge pull request #34 from living-ip/codex/teleo-agent-autorecover-20260702
Some checks are pending
CI / lint-and-test (push) Waiting to run

Add Telegram agent autorecovery timer
This commit is contained in:
twentyOne2x 2026-07-02 23:38:30 +02:00 committed by GitHub
commit 66ecbf316e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 224 additions and 0 deletions

View file

@ -16,6 +16,7 @@ PIPELINE_DIR="/opt/teleo-eval/pipeline"
# Deployment target directories under /opt/teleo-eval.
TELEGRAM_DIR="/opt/teleo-eval/telegram"
DIAGNOSTICS_DIR="/opt/teleo-eval/diagnostics"
AGENT_STATE_DIR="/opt/teleo-eval/ops/agent-state"
# Destination for the unit files copied from the repo's systemd/ directory.
SYSTEMD_DIR="/etc/systemd/system"
# NOTE(review): presumably records the last deployed commit SHA — confirm against the code that writes it.
STAMP_FILE="/opt/teleo-eval/.last-deploy-sha"
LOG_TAG="auto-deploy"
@ -99,6 +100,21 @@ find /opt/teleo-eval -maxdepth 3 -name '*.sh' -not -perm -u+x -exec chmod +x {}
log "Files synced"
# Install unit files on the first deploy (OLD_SHA is "none") or whenever
# anything under systemd/ differs between the old and new commits.
if [ "$OLD_SHA" = "none" ] || git diff --name-only "$OLD_SHA" "$NEW_SHA" -- systemd/ 2>/dev/null | grep -q .; then
log "Installing systemd units"
for unit in systemd/*.service systemd/*.timer; do
# Skip anything that is not a regular file (e.g. a glob that matched nothing).
[ -f "$unit" ] || continue
sudo install -m 0644 "$unit" "$SYSTEMD_DIR/$(basename "$unit")"
done
# Make systemd pick up the new/changed unit files before enabling timers.
sudo systemctl daemon-reload
# Timers are enabled and started only if their unit file ships in this checkout.
if [ -f systemd/teleo-auto-deploy.timer ]; then
sudo systemctl enable --now teleo-auto-deploy.timer >/dev/null
fi
if [ -f systemd/teleo-agent-healthcheck.timer ]; then
sudo systemctl enable --now teleo-agent-healthcheck.timer >/dev/null
fi
fi
# Restart services only if Python files changed
RESTART=""
add_restart() {
@ -125,6 +141,10 @@ if [ "$OLD_SHA" != "none" ]; then
add_restart_if_unit_active teleo-agent@leo
add_restart_if_unit_exists teleo-agent@leo-wallet-test
fi
# A change to the agent unit template also queues the agent restarts, independent
# of any Python change (the add_restart_* helpers are defined outside this view).
if git diff --name-only "$OLD_SHA" "$NEW_SHA" -- systemd/teleo-agent@.service 2>/dev/null | grep -q .; then
add_restart_if_unit_active teleo-agent@leo
add_restart_if_unit_exists teleo-agent@leo-wallet-test
fi
# Queue a diagnostics restart only when a .py file under diagnostics/ changed.
if git diff --name-only "$OLD_SHA" "$NEW_SHA" -- diagnostics/ 2>/dev/null | grep -q '\.py$'; then
add_restart teleo-diagnostics
fi

View file

@ -10,6 +10,7 @@ VPS_PIPELINE="/opt/teleo-eval/pipeline"
# Target directories on the VPS.
VPS_TELEGRAM="/opt/teleo-eval/telegram"
VPS_DIAGNOSTICS="/opt/teleo-eval/diagnostics"
VPS_AGENT_STATE="/opt/teleo-eval/ops/agent-state"
# Remote directory that receives the systemd unit files.
VPS_SYSTEMD="/etc/systemd/system"
# Repository root: the parent of the directory containing this script.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
DRY_RUN=false
@ -94,6 +95,14 @@ echo "=== Research session ==="
rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/research/research-session.sh" "$VPS_HOST:/opt/teleo-eval/research-session.sh"
echo ""
echo "=== Systemd units ==="
if $DRY_RUN; then
    rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/systemd/" "$VPS_HOST:/tmp/teleo-systemd-dry-run/"
else
    # Stream the unit files over ssh, stage them in a temp dir, then install
    # them as root. The remote snippet runs under `set -e` so a failed unpack
    # or install aborts before daemon-reload/enable instead of enabling timers
    # against stale or missing units, and ssh's exit status reflects the
    # failure. The EXIT trap removes the staging dir on every path.
    # \$ and \" are escaped so they are expanded on the VPS, not locally;
    # $VPS_SYSTEMD is intentionally expanded locally.
    tar -C "$REPO_ROOT/systemd" -cf - . | ssh "$VPS_HOST" "set -e; tmpdir=\$(mktemp -d); trap 'rm -rf \"\$tmpdir\"' EXIT; tar -C \"\$tmpdir\" -xf -; sudo install -m 0644 \"\$tmpdir\"/*.service \"\$tmpdir\"/*.timer '$VPS_SYSTEMD'/; sudo systemctl daemon-reload; sudo systemctl enable --now teleo-auto-deploy.timer teleo-agent-healthcheck.timer >/dev/null"
fi
echo ""
if $DRY_RUN; then
echo "Dry run complete. No changes made."
exit 0

View file

@ -0,0 +1,10 @@
# Runs the Telegram agent healthcheck script once per activation.
[Unit]
Description=Teleo Telegram agent autorecovery check
After=network.target
Wants=network.target
[Service]
# oneshot: the unit counts as started only once the script has exited.
Type=oneshot
# Checks the leo and leo-wallet-test agents; --since is the journal window
# the script passes to journalctl when scanning for recoverable faults.
ExecStart=/opt/teleo-eval/pipeline/.venv/bin/python3 /opt/teleo-eval/telegram/agent_healthcheck.py --agents leo leo-wallet-test --since "20 min ago"
StandardOutput=journal
StandardError=journal

View file

@ -0,0 +1,11 @@
# Periodically triggers the unit of the same name with a .service suffix
# (the default when no Unit= is given).
[Unit]
Description=Run Teleo Telegram agent autorecovery check

[Timer]
# First run two minutes after boot, then five minutes after each activation
# of the service.
OnBootSec=2min
OnUnitActiveSec=5min
# Allow up to 30s of scheduling slack.
AccuracySec=30s
# Persistent= was dropped: it only has an effect on OnCalendar= timers and
# this timer is purely monotonic, so it was a no-op.

[Install]
WantedBy=timers.target

View file

@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""Recover Teleo Telegram agent services from known runtime faults."""
from __future__ import annotations
import argparse
import subprocess
import sys
from dataclasses import dataclass
# Agents checked when --agents is not given on the command line.
DEFAULT_AGENTS = ("leo", "leo-wallet-test")
# Directory that is looked up in each agent unit's ReadWritePaths.
TRANSCRIPT_PATH = "/opt/teleo-eval/transcripts"
# Log substrings that trigger a restart: the "Read-only file system" error
# for the transcript directory, in both single- and double-quoted form.
RECOVERABLE_LOG_PATTERNS = (
"Read-only file system: '/opt/teleo-eval/transcripts",
'Read-only file system: "/opt/teleo-eval/transcripts',
)
@dataclass(frozen=True)
class CommandResult:
    """Captured outcome of one external command."""

    # Process exit status; 0 means success.
    returncode: int
    # Decoded standard output.
    stdout: str
    # Decoded standard error (or the launch error text, see run_command).
    stderr: str


def run_command(args: list[str]) -> CommandResult:
    """Run *args* without a shell and capture its exit status and output.

    A non-zero exit is reported through ``returncode`` rather than raised.
    A command that cannot be launched at all is reported the same way, so
    callers that only inspect ``returncode`` see an ordinary failure instead
    of an unhandled exception.

    Args:
        args: Program and arguments, passed to ``subprocess.run`` as a list.

    Returns:
        The exit status with decoded stdout/stderr. If the executable is
        missing, ``returncode`` is 127; for any other launch error it is 126.
        In both cases ``stdout`` is empty and ``stderr`` holds the error text.
    """
    try:
        completed = subprocess.run(args, capture_output=True, text=True, check=False)
    except FileNotFoundError as exc:
        # Same convention a shell uses for "command not found".
        return CommandResult(returncode=127, stdout="", stderr=str(exc))
    except OSError as exc:
        # E.g. permission denied: found but could not be executed.
        return CommandResult(returncode=126, stdout="", stderr=str(exc))
    return CommandResult(
        returncode=completed.returncode,
        stdout=completed.stdout,
        stderr=completed.stderr,
    )
def unit_name(agent: str) -> str:
    """Return the ``teleo-agent@`` template unit name for *agent*."""
    return "teleo-agent@{}.service".format(agent)
def service_is_active(unit: str) -> bool:
    """Report whether ``systemctl is-active --quiet`` exits 0 for *unit*."""
    probe = run_command(["systemctl", "is-active", "--quiet", unit])
    return probe.returncode == 0
def readwrite_paths_include_transcripts(unit: str) -> bool:
    """Check that *unit*'s ``ReadWritePaths`` covers the transcript directory.

    Each entry reported by ``systemctl show`` is normalised before comparing:
    the optional systemd ``-`` (ignore if missing) and ``+`` (relative to the
    unit's root directory) prefixes and any trailing slash are dropped. An
    entry counts if it is the transcript directory itself or one of its
    ancestors, since a writable parent makes the directory writable too.

    Args:
        unit: Full systemd unit name to inspect.

    Returns:
        True if a matching entry is found; False if none matches or the
        ``systemctl show`` call fails.
    """
    result = run_command(["systemctl", "show", unit, "-p", "ReadWritePaths", "--value"])
    if result.returncode != 0:
        return False
    wanted = TRANSCRIPT_PATH.rstrip("/")
    for entry in result.stdout.split():
        path = entry.lstrip("-+")
        if not path.startswith("/"):
            # Not an absolute path; nothing meaningful to compare against.
            continue
        # "/" becomes "", which makes the ancestor test below match everything.
        path = path.rstrip("/")
        if wanted == path or wanted.startswith(path + "/"):
            return True
    return False
def recent_logs(unit: str, since: str) -> str:
    """Return journal text for *unit* since *since*, limited to its current run.

    The ``--since`` window alone can reach back past the unit's most recent
    (re)start. Log lines from before a recovery restart would then still be
    returned on later checks and trigger another restart each time until they
    age out of the window. To avoid that, the query is narrowed to the unit's
    current invocation ID whenever systemd reports one; otherwise the plain
    time-window query is used.

    Args:
        unit: Full systemd unit name.
        since: Value passed to ``journalctl --since``.

    Returns:
        journalctl's stdout; when journalctl exits non-zero, stdout followed
        by stderr.
    """
    command = ["journalctl", "-u", unit, "--since", since, "--no-pager"]
    invocation = run_command(["systemctl", "show", unit, "-p", "InvocationID", "--value"])
    invocation_id = invocation.stdout.strip().lower() if invocation.returncode == 0 else ""
    # Only add the match when the value looks like a real 128-bit hex ID.
    if len(invocation_id) == 32 and all(ch in "0123456789abcdef" for ch in invocation_id):
        command.append(f"_SYSTEMD_INVOCATION_ID={invocation_id}")
    result = run_command(command)
    if result.returncode != 0:
        return result.stdout + result.stderr
    return result.stdout
def should_restart_from_logs(log_text: str) -> bool:
    """Return True if *log_text* contains any recoverable-fault marker."""
    for marker in RECOVERABLE_LOG_PATTERNS:
        if marker in log_text:
            return True
    return False
def restart_service(unit: str, *, dry_run: bool) -> bool:
    """Restart *unit* via ``systemctl restart``.

    In dry-run mode nothing is executed; the intended restart is printed and
    success is reported. Otherwise a non-zero exit is logged to stderr.

    Returns:
        True on success (or dry run), False if ``systemctl restart`` failed.
    """
    if dry_run:
        print(f"dry_run restart {unit}")
        return True

    outcome = run_command(["systemctl", "restart", unit])
    succeeded = outcome.returncode == 0
    if not succeeded:
        print(f"restart_failed unit={unit} stderr={outcome.stderr.strip()}", file=sys.stderr)
    return succeeded
def check_agent(agent: str, *, since: str, dry_run: bool) -> bool:
    """Check one agent's unit and restart it when a known fault is detected.

    The unit is restarted when it is inactive or, if it is active, when its
    recent logs contain a recoverable fault. A unit whose ReadWritePaths does
    not list the transcript directory is reported on stderr and makes the
    check fail regardless of the restart outcome.

    Args:
        agent: Agent name (the instance part of the template unit).
        since: Journal window forwarded to the log lookup.
        dry_run: When true, print the restart instead of performing it.

    Returns:
        True if the unit is configured correctly and is healthy or was
        recovered; False otherwise.
    """
    unit = unit_name(agent)

    config_ok = readwrite_paths_include_transcripts(unit)
    if not config_ok:
        print(f"unit_missing_transcript_write_path unit={unit}", file=sys.stderr)

    # At most one reason applies: logs are only inspected for an active unit.
    if not service_is_active(unit):
        reason = "inactive"
    elif should_restart_from_logs(recent_logs(unit, since)):
        reason = "recoverable_log_fault"
    else:
        print(f"healthy unit={unit}")
        return config_ok

    print(f"recovering unit={unit} reasons={reason}")
    recovered = restart_service(unit, dry_run=dry_run) and config_ok
    if recovered and not dry_run:
        # Confirm the unit actually came back after the restart.
        recovered = service_is_active(unit)
        print(f"post_restart unit={unit} active={recovered}")
    return recovered
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the command-line options from *argv*.

    Options: ``--agents`` (one or more names, defaulting to DEFAULT_AGENTS),
    ``--since`` (journal window, default "20 min ago") and ``--dry-run``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--agents", nargs="+", default=[*DEFAULT_AGENTS])
    cli.add_argument("--since", default="20 min ago")
    cli.add_argument("--dry-run", action="store_true")
    namespace = cli.parse_args(argv)
    return namespace
def main(argv: list[str] | None = None) -> int:
    """Check every requested agent and return a process exit code.

    Args:
        argv: Arguments to parse; defaults to ``sys.argv[1:]`` when None.

    Returns:
        0 if every agent check succeeded, 1 otherwise.
    """
    raw_args = argv if argv is not None else sys.argv[1:]
    args = parse_args(raw_args)
    # Build the full list first so every agent is checked even after a failure.
    outcomes = [
        check_agent(agent, since=args.since, dry_run=args.dry_run)
        for agent in args.agents
    ]
    return 0 if all(outcomes) else 1


if __name__ == "__main__":
    raise SystemExit(main())

View file

@ -0,0 +1,35 @@
from pathlib import Path
import sys
# Put <repo>/telegram first on sys.path so agent_healthcheck can be imported
# as a top-level module by the import that follows.
REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT / "telegram"))
import agent_healthcheck # noqa: E402
def test_unit_name_uses_teleo_agent_template():
    """Agent names map onto instances of the teleo-agent@ template unit."""
    expected_units = {
        "leo": "teleo-agent@leo.service",
        "leo-wallet-test": "teleo-agent@leo-wallet-test.service",
    }
    for agent, expected in expected_units.items():
        assert agent_healthcheck.unit_name(agent) == expected
def test_should_restart_from_transcript_read_only_fault():
    """A single-quoted read-only transcript error is recoverable."""
    fault_line = "OSError: [Errno 30] Read-only file system: '/opt/teleo-eval/transcripts/leo'"
    verdict = agent_healthcheck.should_restart_from_logs(fault_line)
    assert verdict
def test_should_restart_from_double_quoted_transcript_read_only_fault():
    """A double-quoted read-only transcript error is recoverable too."""
    fault_line = 'OSError: [Errno 30] Read-only file system: "/opt/teleo-eval/transcripts/leo"'
    verdict = agent_healthcheck.should_restart_from_logs(fault_line)
    assert verdict
def test_should_not_restart_from_generic_application_log():
    """Ordinary startup output must not trigger a restart."""
    benign_output = "INFO:root:Application started\nINFO:root:Bot running as @livingipleobot"
    verdict = agent_healthcheck.should_restart_from_logs(benign_output)
    assert not verdict
def test_default_agents_are_live_leo_and_wallet_test():
    """The default agent set is exactly leo and leo-wallet-test, in order."""
    expected_defaults = ("leo", "leo-wallet-test")
    assert agent_healthcheck.DEFAULT_AGENTS == expected_defaults

View file

@ -45,3 +45,28 @@ def test_auto_deploy_prefers_github_remote_when_present():
assert 'git fetch "$DEPLOY_REMOTE" main' in auto_deploy
assert 'git rev-parse "$DEPLOY_REMOTE/main"' in auto_deploy
assert 'git merge --ff-only "$DEPLOY_REMOTE/main"' in auto_deploy
def test_agent_healthcheck_timer_runs_both_live_telegram_agents():
    """The healthcheck service and timer units carry the expected settings."""
    systemd_dir = REPO_ROOT / "systemd"
    service = (systemd_dir / "teleo-agent-healthcheck.service").read_text()
    timer = (systemd_dir / "teleo-agent-healthcheck.timer").read_text()

    service_needles = (
        "/opt/teleo-eval/telegram/agent_healthcheck.py",
        "--agents leo leo-wallet-test",
        "--since \"20 min ago\"",
    )
    for needle in service_needles:
        assert needle in service

    timer_needles = ("OnUnitActiveSec=5min", "WantedBy=timers.target")
    for needle in timer_needles:
        assert needle in timer
def test_deploy_scripts_install_systemd_units_and_enable_agent_healthcheck():
    """Both deploy scripts install the systemd units and enable the healthcheck timer."""
    deploy_dir = REPO_ROOT / "deploy"
    auto_deploy = (deploy_dir / "auto-deploy.sh").read_text()
    manual_deploy = (deploy_dir / "deploy.sh").read_text()

    auto_needles = (
        'SYSTEMD_DIR="/etc/systemd/system"',
        'sudo install -m 0644 "$unit" "$SYSTEMD_DIR/$(basename "$unit")"',
        "sudo systemctl daemon-reload",
        "sudo systemctl enable --now teleo-agent-healthcheck.timer",
        "git diff --name-only \"$OLD_SHA\" \"$NEW_SHA\" -- systemd/teleo-agent@.service",
    )
    for needle in auto_needles:
        assert needle in auto_deploy

    manual_needles = (
        'VPS_SYSTEMD="/etc/systemd/system"',
        "teleo-agent-healthcheck.timer",
    )
    for needle in manual_needles:
        assert needle in manual_deploy