teleo-codex/ops/pipeline-v2/lib/breaker.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00

150 lines
5 KiB
Python

"""Circuit breaker state machine — per-stage, backed by SQLite."""
import logging
from datetime import datetime, timezone
from . import config
logger = logging.getLogger("pipeline.breaker")
# States
CLOSED = "closed"
OPEN = "open"
HALFOPEN = "halfopen"
class CircuitBreaker:
"""Per-stage circuit breaker.
CLOSED: normal operation
OPEN: stage paused (threshold consecutive failures reached)
HALFOPEN: cooldown expired, try 1 worker to probe recovery
"""
def __init__(self, name: str, conn):
self.name = name
self.conn = conn
self._ensure_row()
def _ensure_row(self):
self.conn.execute(
"INSERT OR IGNORE INTO circuit_breakers (name) VALUES (?)",
(self.name,),
)
def _get_state(self) -> dict:
row = self.conn.execute(
"SELECT state, failures, successes, tripped_at, last_success_at FROM circuit_breakers WHERE name = ?",
(self.name,),
).fetchone()
return (
dict(row)
if row
else {"state": CLOSED, "failures": 0, "successes": 0, "tripped_at": None, "last_success_at": None}
)
def _set_state(
self,
state: str,
failures: int = None,
successes: int = None,
tripped_at: str = None,
last_success_at: str = None,
):
updates = ["state = ?", "last_update = datetime('now')"]
params = [state]
if failures is not None:
updates.append("failures = ?")
params.append(failures)
if successes is not None:
updates.append("successes = ?")
params.append(successes)
if tripped_at is not None:
updates.append("tripped_at = ?")
params.append(tripped_at)
if last_success_at is not None:
updates.append("last_success_at = ?")
params.append(last_success_at)
params.append(self.name)
self.conn.execute(
f"UPDATE circuit_breakers SET {', '.join(updates)} WHERE name = ?",
params,
)
def allow_request(self) -> bool:
"""Check if requests are allowed. Returns True if CLOSED or HALFOPEN."""
s = self._get_state()
if s["state"] == CLOSED:
return True
if s["state"] == OPEN:
# Check cooldown
if s["tripped_at"]:
tripped = datetime.fromisoformat(s["tripped_at"])
if tripped.tzinfo is None:
tripped = tripped.replace(tzinfo=timezone.utc)
elapsed = (datetime.now(timezone.utc) - tripped).total_seconds()
if elapsed >= config.BREAKER_COOLDOWN:
logger.info("Breaker %s: cooldown expired, entering HALFOPEN", self.name)
self._set_state(HALFOPEN, successes=0)
return True
return False
# HALFOPEN — allow one probe
return True
def max_workers(self) -> int:
"""Return max workers allowed in current state."""
s = self._get_state()
if s["state"] == HALFOPEN:
return 1 # probe with single worker
return None # no restriction from breaker
def record_success(self):
"""Record a successful cycle. Updates last_success_at for stall detection (Vida)."""
s = self._get_state()
now = datetime.now(timezone.utc).isoformat()
if s["state"] == HALFOPEN:
logger.info("Breaker %s: HALFOPEN probe succeeded, closing", self.name)
self._set_state(CLOSED, failures=0, successes=0, last_success_at=now)
elif s["state"] == CLOSED:
if s["failures"] > 0:
self._set_state(CLOSED, failures=0, last_success_at=now)
else:
self._set_state(CLOSED, last_success_at=now)
def record_failure(self):
"""Record a failed cycle."""
s = self._get_state()
if s["state"] == HALFOPEN:
logger.warning("Breaker %s: HALFOPEN probe failed, reopening", self.name)
self._set_state(
OPEN,
failures=s["failures"] + 1,
tripped_at=datetime.now(timezone.utc).isoformat(),
)
elif s["state"] == CLOSED:
new_failures = s["failures"] + 1
if new_failures >= config.BREAKER_THRESHOLD:
logger.warning(
"Breaker %s: threshold reached (%d failures), opening",
self.name,
new_failures,
)
self._set_state(
OPEN,
failures=new_failures,
tripped_at=datetime.now(timezone.utc).isoformat(),
)
else:
self._set_state(CLOSED, failures=new_failures)
elif s["state"] == OPEN:
self._set_state(OPEN, failures=s["failures"] + 1)
def reset(self):
"""Force reset to CLOSED."""
logger.info("Breaker %s: force reset to CLOSED", self.name)
self._set_state(CLOSED, failures=0, successes=0)