diff --git a/diagnostics/alerting.py b/diagnostics/alerting.py deleted file mode 100644 index 33dde714e..000000000 --- a/diagnostics/alerting.py +++ /dev/null @@ -1,537 +0,0 @@ -"""Argus active monitoring — health watchdog, quality regression, throughput anomaly detection. - -Provides check functions that detect problems and return structured alerts. -Called by /check endpoint (periodic cron) or on-demand. - -Alert schema: - { - "id": str, # unique key for dedup (e.g. "dormant:ganymede") - "severity": str, # "critical" | "warning" | "info" - "category": str, # "health" | "quality" | "throughput" | "failure_pattern" - "title": str, # human-readable headline - "detail": str, # actionable description - "agent": str|None, # affected agent (if applicable) - "domain": str|None, # affected domain (if applicable) - "detected_at": str, # ISO timestamp - "auto_resolve": bool, # clears when condition clears - } -""" - -import json -import sqlite3 -import statistics -from datetime import datetime, timezone - - -# ─── Agent-domain mapping (static config, maintained by Argus) ────────────── - -AGENT_DOMAINS = { - "rio": ["internet-finance"], - "clay": ["creative-industries"], - "ganymede": None, # reviewer — cross-domain - "epimetheus": None, # infra - "leo": None, # standards - "oberon": None, # evolution tracking - "vida": None, # health monitoring - "hermes": None, # comms - "astra": None, # research -} - -# Thresholds -DORMANCY_HOURS = 48 -APPROVAL_DROP_THRESHOLD = 15 # percentage points below 7-day baseline -THROUGHPUT_DROP_RATIO = 0.5 # alert if today < 50% of 7-day SMA -REJECTION_SPIKE_RATIO = 0.20 # single reason > 20% of recent rejections -STUCK_LOOP_THRESHOLD = 3 # same agent + same rejection reason > N times in 6h -COST_SPIKE_RATIO = 2.0 # daily cost > 2x 7-day average - - -def _now_iso() -> str: - return datetime.now(timezone.utc).isoformat() - - -# ─── Check: Agent Health (dormancy detection) ─────────────────────────────── - - -def check_agent_health(conn: 
sqlite3.Connection) -> list[dict]: - """Detect agents with no PR activity in the last DORMANCY_HOURS hours.""" - alerts = [] - - # Get last activity per agent - rows = conn.execute( - """SELECT agent, MAX(last_attempt) as latest, COUNT(*) as total_prs - FROM prs WHERE agent IS NOT NULL - GROUP BY agent""" - ).fetchall() - - now = datetime.now(timezone.utc) - for r in rows: - agent = r["agent"] - latest = r["latest"] - if not latest: - continue - - last_dt = datetime.fromisoformat(latest) - if last_dt.tzinfo is None: - last_dt = last_dt.replace(tzinfo=timezone.utc) - - hours_since = (now - last_dt).total_seconds() / 3600 - - if hours_since > DORMANCY_HOURS: - alerts.append({ - "id": f"dormant:{agent}", - "severity": "warning", - "category": "health", - "title": f"Agent '{agent}' dormant for {int(hours_since)}h", - "detail": ( - f"No PR activity since {latest}. " - f"Last seen {int(hours_since)}h ago (threshold: {DORMANCY_HOURS}h). " - f"Total historical PRs: {r['total_prs']}." - ), - "agent": agent, - "domain": None, - "detected_at": _now_iso(), - "auto_resolve": True, - }) - - return alerts - - -# ─── Check: Quality Regression (approval rate drop) ───────────────────────── - - -def check_quality_regression(conn: sqlite3.Connection) -> list[dict]: - """Detect approval rate drops vs 7-day baseline, per agent and per domain.""" - alerts = [] - - # 7-day baseline approval rate (overall) - baseline = conn.execute( - """SELECT - COUNT(CASE WHEN event='approved' THEN 1 END) as approved, - COUNT(*) as total - FROM audit_log - WHERE stage='evaluate' - AND event IN ('approved','changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-7 days')""" - ).fetchone() - baseline_rate = (baseline["approved"] / baseline["total"] * 100) if baseline["total"] else None - - # 24h approval rate (overall) - recent = conn.execute( - """SELECT - COUNT(CASE WHEN event='approved' THEN 1 END) as approved, - COUNT(*) as total - FROM audit_log - WHERE 
stage='evaluate' - AND event IN ('approved','changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-24 hours')""" - ).fetchone() - recent_rate = (recent["approved"] / recent["total"] * 100) if recent["total"] else None - - if baseline_rate is not None and recent_rate is not None: - drop = baseline_rate - recent_rate - if drop > APPROVAL_DROP_THRESHOLD: - alerts.append({ - "id": "quality_regression:overall", - "severity": "critical", - "category": "quality", - "title": f"Approval rate dropped {drop:.0f}pp (24h: {recent_rate:.0f}% vs 7d: {baseline_rate:.0f}%)", - "detail": ( - f"24h approval rate ({recent_rate:.1f}%) is {drop:.1f} percentage points below " - f"7-day baseline ({baseline_rate:.1f}%). " - f"Evaluated {recent['total']} PRs in last 24h." - ), - "agent": None, - "domain": None, - "detected_at": _now_iso(), - "auto_resolve": True, - }) - - # Per-agent approval rate (24h vs 7d) — only for agents with >=5 evals in each window - # COALESCE: rejection events use $.agent, eval events use $.domain_agent (Epimetheus 2026-03-28) - _check_approval_by_dimension(conn, alerts, "agent", "COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent'))") - - # Per-domain approval rate (24h vs 7d) — Theseus addition - _check_approval_by_dimension(conn, alerts, "domain", "json_extract(detail, '$.domain')") - - return alerts - - -def _check_approval_by_dimension(conn, alerts, dim_name, dim_expr): - """Check approval rate regression grouped by a dimension (agent or domain).""" - # 7-day baseline per dimension - baseline_rows = conn.execute( - f"""SELECT {dim_expr} as dim_val, - COUNT(CASE WHEN event='approved' THEN 1 END) as approved, - COUNT(*) as total - FROM audit_log - WHERE stage='evaluate' - AND event IN ('approved','changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-7 days') - AND {dim_expr} IS NOT NULL - GROUP BY dim_val HAVING total >= 5""" - ).fetchall() - baselines = 
{r["dim_val"]: (r["approved"] / r["total"] * 100) for r in baseline_rows} - - # 24h per dimension - recent_rows = conn.execute( - f"""SELECT {dim_expr} as dim_val, - COUNT(CASE WHEN event='approved' THEN 1 END) as approved, - COUNT(*) as total - FROM audit_log - WHERE stage='evaluate' - AND event IN ('approved','changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-24 hours') - AND {dim_expr} IS NOT NULL - GROUP BY dim_val HAVING total >= 5""" - ).fetchall() - - for r in recent_rows: - val = r["dim_val"] - if val not in baselines: - continue - recent_rate = r["approved"] / r["total"] * 100 - base_rate = baselines[val] - drop = base_rate - recent_rate - if drop > APPROVAL_DROP_THRESHOLD: - alerts.append({ - "id": f"quality_regression:{dim_name}:{val}", - "severity": "warning", - "category": "quality", - "title": f"{dim_name.title()} '{val}' approval dropped {drop:.0f}pp", - "detail": ( - f"24h: {recent_rate:.1f}% vs 7d baseline: {base_rate:.1f}% " - f"({r['total']} evals in 24h)." 
- ), - "agent": val if dim_name == "agent" else None, - "domain": val if dim_name == "domain" else None, - "detected_at": _now_iso(), - "auto_resolve": True, - }) - - -# ─── Check: Throughput Anomaly ────────────────────────────────────────────── - - -def check_throughput(conn: sqlite3.Connection) -> list[dict]: - """Detect throughput stalling — today vs 7-day SMA.""" - alerts = [] - - # Daily merged counts for last 7 days - rows = conn.execute( - """SELECT date(merged_at) as day, COUNT(*) as n - FROM prs WHERE merged_at > datetime('now', '-7 days') - GROUP BY day ORDER BY day""" - ).fetchall() - - if len(rows) < 2: - return alerts # Not enough data - - daily_counts = [r["n"] for r in rows] - sma = statistics.mean(daily_counts[:-1]) if len(daily_counts) > 1 else daily_counts[0] - today_count = daily_counts[-1] - - if sma > 0 and today_count < sma * THROUGHPUT_DROP_RATIO: - alerts.append({ - "id": "throughput:stalling", - "severity": "warning", - "category": "throughput", - "title": f"Throughput stalling: {today_count} merges today vs {sma:.0f}/day avg", - "detail": ( - f"Today's merge count ({today_count}) is below {THROUGHPUT_DROP_RATIO:.0%} of " - f"7-day average ({sma:.1f}/day). Daily counts: {daily_counts}." 
- ), - "agent": None, - "domain": None, - "detected_at": _now_iso(), - "auto_resolve": True, - }) - - return alerts - - -# ─── Check: Rejection Reason Spike ───────────────────────────────────────── - - -def check_rejection_spike(conn: sqlite3.Connection) -> list[dict]: - """Detect single rejection reason exceeding REJECTION_SPIKE_RATIO of recent rejections.""" - alerts = [] - - # Total rejections in 24h - total = conn.execute( - """SELECT COUNT(*) as n FROM audit_log - WHERE stage='evaluate' - AND event IN ('changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-24 hours')""" - ).fetchone()["n"] - - if total < 10: - return alerts # Not enough data - - # Count by rejection tag - tags = conn.execute( - """SELECT value as tag, COUNT(*) as cnt - FROM audit_log, json_each(json_extract(detail, '$.issues')) - WHERE stage='evaluate' - AND event IN ('changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-24 hours') - GROUP BY tag ORDER BY cnt DESC""" - ).fetchall() - - for t in tags: - ratio = t["cnt"] / total - if ratio > REJECTION_SPIKE_RATIO: - alerts.append({ - "id": f"rejection_spike:{t['tag']}", - "severity": "warning", - "category": "quality", - "title": f"Rejection reason '{t['tag']}' at {ratio:.0%} of rejections", - "detail": ( - f"'{t['tag']}' accounts for {t['cnt']}/{total} rejections in 24h " - f"({ratio:.1%}). Threshold: {REJECTION_SPIKE_RATIO:.0%}." 
- ), - "agent": None, - "domain": None, - "detected_at": _now_iso(), - "auto_resolve": True, - }) - - return alerts - - -# ─── Check: Stuck Loops ──────────────────────────────────────────────────── - - -def check_stuck_loops(conn: sqlite3.Connection) -> list[dict]: - """Detect agents repeatedly failing on the same rejection reason.""" - alerts = [] - - # COALESCE: rejection events use $.agent, eval events use $.domain_agent (Epimetheus 2026-03-28) - rows = conn.execute( - """SELECT COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) as agent, - value as tag, - COUNT(*) as cnt - FROM audit_log, json_each(json_extract(detail, '$.issues')) - WHERE stage='evaluate' - AND event IN ('changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-6 hours') - AND COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) IS NOT NULL - GROUP BY agent, tag - HAVING cnt > ?""", - (STUCK_LOOP_THRESHOLD,), - ).fetchall() - - for r in rows: - alerts.append({ - "id": f"stuck_loop:{r['agent']}:{r['tag']}", - "severity": "critical", - "category": "health", - "title": f"Agent '{r['agent']}' stuck: '{r['tag']}' failed {r['cnt']}x in 6h", - "detail": ( - f"Agent '{r['agent']}' has been rejected for '{r['tag']}' " - f"{r['cnt']} times in the last 6 hours (threshold: {STUCK_LOOP_THRESHOLD}). " - f"Stop and reassess." 
- ), - "agent": r["agent"], - "domain": None, - "detected_at": _now_iso(), - "auto_resolve": True, - }) - - return alerts - - -# ─── Check: Cost Spikes ──────────────────────────────────────────────────── - - -def check_cost_spikes(conn: sqlite3.Connection) -> list[dict]: - """Detect daily cost exceeding 2x of 7-day average per agent.""" - alerts = [] - - # Check if costs table exists and has agent column - try: - cols = conn.execute("PRAGMA table_info(costs)").fetchall() - col_names = {c["name"] for c in cols} - except sqlite3.Error: - return alerts - - if "agent" not in col_names or "cost_usd" not in col_names: - # Fall back to per-PR cost tracking - rows = conn.execute( - """SELECT agent, - SUM(CASE WHEN created_at > datetime('now', '-1 day') THEN cost_usd ELSE 0 END) as today_cost, - SUM(CASE WHEN created_at > datetime('now', '-7 days') THEN cost_usd ELSE 0 END) / 7.0 as avg_daily - FROM prs WHERE agent IS NOT NULL AND cost_usd > 0 - GROUP BY agent - HAVING avg_daily > 0""" - ).fetchall() - else: - rows = conn.execute( - """SELECT agent, - SUM(CASE WHEN timestamp > datetime('now', '-1 day') THEN cost_usd ELSE 0 END) as today_cost, - SUM(CASE WHEN timestamp > datetime('now', '-7 days') THEN cost_usd ELSE 0 END) / 7.0 as avg_daily - FROM costs WHERE agent IS NOT NULL - GROUP BY agent - HAVING avg_daily > 0""" - ).fetchall() - - for r in rows: - if r["avg_daily"] and r["today_cost"] > r["avg_daily"] * COST_SPIKE_RATIO: - ratio = r["today_cost"] / r["avg_daily"] - alerts.append({ - "id": f"cost_spike:{r['agent']}", - "severity": "warning", - "category": "health", - "title": f"Agent '{r['agent']}' cost spike: ${r['today_cost']:.2f} today ({ratio:.1f}x avg)", - "detail": ( - f"Today's cost (${r['today_cost']:.2f}) is {ratio:.1f}x the 7-day daily average " - f"(${r['avg_daily']:.2f}). Threshold: {COST_SPIKE_RATIO}x." 
- ), - "agent": r["agent"], - "domain": None, - "detected_at": _now_iso(), - "auto_resolve": True, - }) - - return alerts - - -# ─── Check: Domain Rejection Patterns (Theseus addition) ─────────────────── - - -def check_domain_rejection_patterns(conn: sqlite3.Connection) -> list[dict]: - """Track rejection reason shift per domain — surfaces domain maturity issues.""" - alerts = [] - - # Per-domain rejection breakdown in 24h - rows = conn.execute( - """SELECT json_extract(detail, '$.domain') as domain, - value as tag, - COUNT(*) as cnt - FROM audit_log, json_each(json_extract(detail, '$.issues')) - WHERE stage='evaluate' - AND event IN ('changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-24 hours') - AND json_extract(detail, '$.domain') IS NOT NULL - GROUP BY domain, tag - ORDER BY domain, cnt DESC""" - ).fetchall() - - # Group by domain - domain_tags = {} - for r in rows: - d = r["domain"] - if d not in domain_tags: - domain_tags[d] = [] - domain_tags[d].append({"tag": r["tag"], "count": r["cnt"]}) - - # Flag if a domain has >50% of rejections from a single reason (concentrated failure) - for domain, tags in domain_tags.items(): - total = sum(t["count"] for t in tags) - if total < 5: - continue - top = tags[0] - ratio = top["count"] / total - if ratio > 0.5: - alerts.append({ - "id": f"domain_rejection_pattern:{domain}:{top['tag']}", - "severity": "info", - "category": "failure_pattern", - "title": f"Domain '{domain}': {ratio:.0%} of rejections are '{top['tag']}'", - "detail": ( - f"In domain '{domain}', {top['count']}/{total} rejections (24h) are for " - f"'{top['tag']}'. This may indicate a systematic issue with evidence standards " - f"or schema compliance in this domain." 
- ), - "agent": None, - "domain": domain, - "detected_at": _now_iso(), - "auto_resolve": True, - }) - - return alerts - - -# ─── Failure Report Generator ─────────────────────────────────────────────── - - -def generate_failure_report(conn: sqlite3.Connection, agent: str, hours: int = 24) -> dict | None: - """Compile a failure report for a specific agent. - - Returns top rejection reasons, example PRs, and suggested fixes. - Designed to be sent directly to the agent via Pentagon messaging. - """ - hours = int(hours) # defensive — callers should pass int, but enforce it - rows = conn.execute( - """SELECT value as tag, COUNT(*) as cnt, - GROUP_CONCAT(DISTINCT json_extract(detail, '$.pr')) as pr_numbers - FROM audit_log, json_each(json_extract(detail, '$.issues')) - WHERE stage='evaluate' - AND event IN ('changes_requested','domain_rejected','tier05_rejected') - AND json_extract(detail, '$.agent') = ? - AND timestamp > datetime('now', ? || ' hours') - GROUP BY tag ORDER BY cnt DESC - LIMIT 5""", - (agent, f"-{hours}"), - ).fetchall() - - if not rows: - return None - - total_rejections = sum(r["cnt"] for r in rows) - top_reasons = [] - for r in rows: - prs = r["pr_numbers"].split(",")[:3] if r["pr_numbers"] else [] - top_reasons.append({ - "reason": r["tag"], - "count": r["cnt"], - "pct": round(r["cnt"] / total_rejections * 100, 1), - "example_prs": prs, - "suggestion": _suggest_fix(r["tag"]), - }) - - return { - "agent": agent, - "period_hours": hours, - "total_rejections": total_rejections, - "top_reasons": top_reasons, - "generated_at": _now_iso(), - } - - -def _suggest_fix(rejection_tag: str) -> str: - """Map known rejection reasons to actionable suggestions.""" - suggestions = { - "broken_wiki_links": "Check that all [[wiki links]] in claims resolve to existing files. Run link validation before submitting.", - "near_duplicate": "Search existing claims before creating new ones. 
Use semantic search to find similar claims.", - "frontmatter_schema": "Validate YAML frontmatter against the claim schema. Required fields: title, domain, confidence, type.", - "weak_evidence": "Add concrete sources, data points, or citations. Claims need evidence that can be independently verified.", - "missing_confidence": "Every claim needs a confidence level: proven, likely, experimental, or speculative.", - "domain_mismatch": "Ensure claims are filed under the correct domain. Check domain definitions if unsure.", - "too_broad": "Break broad claims into specific, testable sub-claims.", - "missing_links": "Claims should link to related claims, entities, or sources. Isolated claims are harder to verify.", - } - return suggestions.get(rejection_tag, f"Review rejection reason '{rejection_tag}' and adjust extraction accordingly.") - - -# ─── Run All Checks ──────────────────────────────────────────────────────── - - -def run_all_checks(conn: sqlite3.Connection) -> list[dict]: - """Execute all check functions and return combined alerts.""" - alerts = [] - alerts.extend(check_agent_health(conn)) - alerts.extend(check_quality_regression(conn)) - alerts.extend(check_throughput(conn)) - alerts.extend(check_rejection_spike(conn)) - alerts.extend(check_stuck_loops(conn)) - alerts.extend(check_cost_spikes(conn)) - alerts.extend(check_domain_rejection_patterns(conn)) - return alerts - - -def format_alert_message(alert: dict) -> str: - """Format an alert for Pentagon messaging.""" - severity_icon = {"critical": "!!", "warning": "!", "info": "~"} - icon = severity_icon.get(alert["severity"], "?") - return f"[{icon}] {alert['title']}\n{alert['detail']}" diff --git a/diagnostics/alerting_routes.py b/diagnostics/alerting_routes.py deleted file mode 100644 index fd3574071..000000000 --- a/diagnostics/alerting_routes.py +++ /dev/null @@ -1,125 +0,0 @@ -"""Route handlers for /check and /api/alerts endpoints. - -Import into app.py and register routes in create_app(). 
-""" - -import json -import logging -from datetime import datetime, timezone - -from aiohttp import web -from alerting import run_all_checks, generate_failure_report, format_alert_message # requires CWD = deploy dir; switch to relative import if packaged - -logger = logging.getLogger("argus.alerting") - -# In-memory alert store (replaced each /check cycle, persists between requests) -_active_alerts: list[dict] = [] -_last_check: str | None = None - - -async def handle_check(request): - """GET /check — run all monitoring checks, update active alerts, return results. - - Designed to be called by systemd timer every 5 minutes. - Returns JSON summary of all detected issues. - """ - conn = request.app["_alerting_conn_func"]() - try: - alerts = run_all_checks(conn) - except Exception as e: - logger.error("Check failed: %s", e) - return web.json_response({"error": str(e)}, status=500) - - global _active_alerts, _last_check - _active_alerts = alerts - _last_check = datetime.now(timezone.utc).isoformat() - - # Generate failure reports for agents with stuck loops - failure_reports = {} - stuck_agents = {a["agent"] for a in alerts if a["category"] == "health" and "stuck" in a["id"] and a["agent"]} - for agent in stuck_agents: - report = generate_failure_report(conn, agent) - if report: - failure_reports[agent] = report - - result = { - "checked_at": _last_check, - "alert_count": len(alerts), - "critical": sum(1 for a in alerts if a["severity"] == "critical"), - "warning": sum(1 for a in alerts if a["severity"] == "warning"), - "info": sum(1 for a in alerts if a["severity"] == "info"), - "alerts": alerts, - "failure_reports": failure_reports, - } - - logger.info( - "Check complete: %d alerts (%d critical, %d warning)", - len(alerts), - result["critical"], - result["warning"], - ) - - return web.json_response(result) - - -async def handle_api_alerts(request): - """GET /api/alerts — return current active alerts. 
- - Query params: - severity: filter by severity (critical, warning, info) - category: filter by category (health, quality, throughput, failure_pattern) - agent: filter by agent name - domain: filter by domain - """ - alerts = list(_active_alerts) - - # Filters - severity = request.query.get("severity") - if severity: - alerts = [a for a in alerts if a["severity"] == severity] - - category = request.query.get("category") - if category: - alerts = [a for a in alerts if a["category"] == category] - - agent = request.query.get("agent") - if agent: - alerts = [a for a in alerts if a.get("agent") == agent] - - domain = request.query.get("domain") - if domain: - alerts = [a for a in alerts if a.get("domain") == domain] - - return web.json_response({ - "alerts": alerts, - "total": len(alerts), - "last_check": _last_check, - }) - - -async def handle_api_failure_report(request): - """GET /api/failure-report/{agent} — generate failure report for an agent. - - Query params: - hours: lookback window (default 24) - """ - agent = request.match_info["agent"] - hours = int(request.query.get("hours", "24")) - conn = request.app["_alerting_conn_func"]() - - report = generate_failure_report(conn, agent, hours) - if not report: - return web.json_response({"agent": agent, "status": "no_rejections", "period_hours": hours}) - - return web.json_response(report) - - -def register_alerting_routes(app, get_conn_func): - """Register alerting routes on the app. - - get_conn_func: callable that returns a read-only sqlite3.Connection - """ - app["_alerting_conn_func"] = get_conn_func - app.router.add_get("/check", handle_check) - app.router.add_get("/api/alerts", handle_api_alerts) - app.router.add_get("/api/failure-report/{agent}", handle_api_failure_report) diff --git a/ops/deploy.sh b/ops/deploy.sh index 31a2f6d1d..c571e9fca 100755 --- a/ops/deploy.sh +++ b/ops/deploy.sh @@ -93,7 +93,115 @@ echo "Deploy complete." 
if $RESTART; then echo "" - echo "=== Restarting services ===" - ssh "$VPS_HOST" "sudo systemctl restart teleo-pipeline teleo-diagnostics" - echo "Services restarted." + echo "=== Detecting services to restart ===" + + # Determine which services need restart based on what was deployed. + # rsync touched these paths → these services: + # pipeline-v2/lib/, pipeline-v2/*.py → teleo-pipeline + # diagnostics/ → teleo-diagnostics + # agent-state/, research-session.sh → no restart (not daemons) + RESTART_SVCS="" + + # Check VPS for recent file changes from this deploy + # Compare local files against VPS to see what actually changed + PIPELINE_CHANGED=false + DIAG_CHANGED=false + + # Pipeline: lib/ or top-level scripts + if ! rsync -avzn --exclude='__pycache__' --exclude='*.pyc' --exclude='*.bak*' \ + "$REPO_ROOT/ops/pipeline-v2/lib/" "$VPS_HOST:$VPS_PIPELINE/lib/" 2>/dev/null | grep -q '\.py$'; then + true # no python changes + else + PIPELINE_CHANGED=true + fi + for f in teleo-pipeline.py reweave.py; do + if [ -f "$REPO_ROOT/ops/pipeline-v2/$f" ]; then + if rsync -avzn "$REPO_ROOT/ops/pipeline-v2/$f" "$VPS_HOST:$VPS_PIPELINE/$f" 2>/dev/null | grep -q "$f"; then + PIPELINE_CHANGED=true + fi + fi + done + + # Diagnostics + if rsync -avzn --exclude='__pycache__' --exclude='*.pyc' --exclude='*.bak*' \ + "$REPO_ROOT/ops/diagnostics/" "$VPS_HOST:$VPS_DIAGNOSTICS/" 2>/dev/null | grep -q '\.py$'; then + DIAG_CHANGED=true + fi + + if $PIPELINE_CHANGED; then + RESTART_SVCS="$RESTART_SVCS teleo-pipeline" + echo " teleo-pipeline: files changed, will restart" + else + echo " teleo-pipeline: no changes, skipping" + fi + + if $DIAG_CHANGED; then + RESTART_SVCS="$RESTART_SVCS teleo-diagnostics" + echo " teleo-diagnostics: files changed, will restart" + else + echo " teleo-diagnostics: no changes, skipping" + fi + + if [ -z "$RESTART_SVCS" ]; then + echo "" + echo "No service files changed. Skipping restart." 
+ else + echo "" + echo "=== Restarting:$RESTART_SVCS ===" + ssh "$VPS_HOST" "sudo systemctl restart $RESTART_SVCS" + echo "Services restarted. Waiting 5s for startup..." + sleep 5 + + echo "" + echo "=== Smoke test ===" + SMOKE_FAIL=0 + + # Check systemd unit status for restarted services + for svc in $RESTART_SVCS; do + if ssh "$VPS_HOST" "systemctl is-active --quiet $svc"; then + echo " $svc: active" + else + echo " $svc: FAILED" + ssh "$VPS_HOST" "journalctl -u $svc -n 10 --no-pager" || true + SMOKE_FAIL=1 + fi + done + + # Hit health endpoints for restarted services + if echo "$RESTART_SVCS" | grep -q "teleo-pipeline"; then + if ssh "$VPS_HOST" "curl -sf --connect-timeout 3 http://localhost:8080/health > /dev/null"; then + echo " pipeline health (8080): OK" + else + echo " pipeline health (8080): FAILED" + SMOKE_FAIL=1 + fi + fi + + if echo "$RESTART_SVCS" | grep -q "teleo-diagnostics"; then + if ssh "$VPS_HOST" "curl -sf --connect-timeout 3 http://localhost:8081/ops > /dev/null"; then + echo " diagnostics (8081): OK" + else + echo " diagnostics (8081): FAILED" + SMOKE_FAIL=1 + fi + fi + + # Tail logs for quick visual check + echo "" + echo "=== Recent logs (10s) ===" + JOURNAL_UNITS="" + for svc in $RESTART_SVCS; do + JOURNAL_UNITS="$JOURNAL_UNITS -u $svc" + done + ssh "$VPS_HOST" "journalctl $JOURNAL_UNITS --since '-10s' --no-pager -n 20" || true + + if [ "$SMOKE_FAIL" -gt 0 ]; then + echo "" + echo "WARNING: Smoke test detected failures. Check logs above." + exit 1 + fi + + echo "" + echo "Smoke test passed." 
+ fi fi diff --git a/ops/diagnostics/CONSOLIDATION-DIFF-LOG.md b/ops/diagnostics/CONSOLIDATION-DIFF-LOG.md new file mode 100644 index 000000000..9f2593be4 --- /dev/null +++ b/ops/diagnostics/CONSOLIDATION-DIFF-LOG.md @@ -0,0 +1,141 @@ +# Diagnostics Consolidation Diff Log +# Branch: epimetheus/consolidate-infra +# Date: 2026-04-13 + +## Files with multiple copies — resolution + +### alerting.py +- ROOT diagnostics/alerting.py (22320 bytes) — KEPT (newer: has _ALLOWED_DIM_EXPRS SQL injection protection, stricter dim_expr validation) +- ops/diagnostics/alerting.py (22039 bytes) — OVERWRITTEN (missing SQL injection guards) +- VPS /opt/teleo-eval/diagnostics/alerting.py (22039 bytes) — matches ops/ version, needs deploy + +### alerting_routes.py +- ROOT diagnostics/alerting_routes.py (4216 bytes) — KEPT (newer: proper try/finally/conn.close, ValueError catch on hours param) +- ops/diagnostics/alerting_routes.py (4043 bytes) — OVERWRITTEN (missing error handling, missing conn.close) +- VPS /opt/teleo-eval/diagnostics/alerting_routes.py (4043 bytes) — matches ops/ version, needs deploy + +### vitality.py +- ROOT diagnostics/vitality.py (25548 bytes) — KEPT (only copy in repo, larger than VPS) +- VPS /opt/teleo-eval/diagnostics/vitality.py (18539 bytes) — older version, needs deploy +- MOVED TO: ops/diagnostics/vitality.py + +### vitality_routes.py +- ROOT diagnostics/vitality_routes.py (10824 bytes) — KEPT (only copy in repo, larger than VPS) +- VPS /opt/teleo-eval/diagnostics/vitality_routes.py (9729 bytes) — older version, needs deploy +- MOVED TO: ops/diagnostics/vitality_routes.py + +## Files moved + +| From | To | Reason | +|------|-----|--------| +| diagnostics/vitality.py | ops/diagnostics/vitality.py | Consolidate to canonical location | +| diagnostics/vitality_routes.py | ops/diagnostics/vitality_routes.py | Consolidate to canonical location | +| diagnostics/alerting.py | ops/diagnostics/alerting.py | Newer version overwrites older | +| 
diagnostics/alerting_routes.py | ops/diagnostics/alerting_routes.py | Newer version overwrites older | + +## Root diagnostics/ after consolidation +- PATCH_INSTRUCTIONS.md — kept (documentation, not code) +- evolution.md — kept (documentation) +- weekly/2026-03-25-week3.md — kept (report) +- ops/sessions/*.json — kept (session data) +- alerting.py, alerting_routes.py REMOVED by this consolidation +- vitality.py, vitality_routes.py were already absent (moved in prior commit) +- No other .py files remain in root diagnostics/ except Argus's 8 patch scripts (see "Argus's Patch Scripts" below) + +## VPS .bak files inventory (30+ files) +All in /opt/teleo-eval/diagnostics/. Git is the backup now. Safe to delete after consolidation verified. + +## VPS deploy needed after merge +alerting.py, alerting_routes.py, vitality.py, vitality_routes.py — all local versions are newer than VPS. + +--- + +## Root Patch Script Audit (Epimetheus's 7 patches) + +### patch-prompt-version.py — APPLIED +- **Target:** db.py, merge.py, extract.py, extraction_prompt.py +- **What:** Schema v17 migration for prompt_version/pipeline_version columns, version stamping on PR discovery, feedback param for re-extraction +- **Status:** All 4 targets have changes. Schema is at v19 (includes this migration). merge.py stamps versions. extract.py has feedback param. extraction_prompt.py has previous_feedback. +- **Action:** SAFE TO DELETE + +### tmp-patch-research-state.py — APPLIED +- **Target:** research-session.sh +- **What:** Integrates agent-state hooks (state_start_session, state_update_report, state_journal_append) +- **Status:** All hooks present in research-session.sh (STATE_LIB sourcing, HAS_STATE init, session lifecycle calls) +- **Action:** SAFE TO DELETE + +### patch-dashboard-cost.py — STALE (superseded) +- **Target:** dashboard_routes.py +- **What:** Adds per-PR cost queries via audit_log (cost_map, triage_cost_map) +- **Status:** Cost tracking implemented differently in current codebase — uses `costs` table and p.cost_usd column, not audit_log aggregation. 
Patch logic abandoned in favor of newer approach. +- **Action:** SAFE TO DELETE (superseded by different implementation) + +### patch-dashboard-prs-cost.py — STALE (superseded) +- **Target:** dashboard_prs.py +- **What:** Adds Cost column header, fmtCost() function, cost cell in row template +- **Status:** Cost KPI card exists (line 101) but implemented as card-based KPI, not table column. fmtCost() not present. Different UI approach than patch intended. +- **Action:** SAFE TO DELETE (superseded by card-based cost display) + +### patch-cost-per-pr.py — NOT APPLIED +- **Target:** evaluate.py +- **What:** Adds _estimate_cost() helper function, cost instrumentation to audit events (haiku_triage, domain_rejected, approved, changes_requested) +- **Status:** _estimate_cost not found in evaluate.py. No cost fields in audit events. eval_checks.py has its own estimate_cost but for bot responses, not pipeline eval. +- **Action:** SAFE TO DELETE — eval_checks.py already has cost estimation for its own use case. The pipeline eval cost tracking was a different approach that was never completed. + +### patch-dashboard-prs-version.py — NOT APPLIED +- **Target:** dashboard_prs.py +- **What:** Adds version badges (prompt_version, pipeline_version) to eval chain section and agent cell +- **Status:** No version badges in dashboard_prs.py. prompt_version/pipeline_version not displayed anywhere. +- **Action:** SAFE TO DELETE — version columns exist in schema (v17 migration) but UI display was never built. Low priority feature, can be re-implemented from schema when needed. + +### patch-dashboard-version.py — NOT APPLIED +- **Target:** dashboard_routes.py, shared_ui.py +- **What:** Adds prompt_version/pipeline_version to SELECT query, version badges to shared_ui +- **Status:** Version fields not in SELECT. shared_ui.py exists but without version display. +- **Action:** SAFE TO DELETE — same reasoning as patch-dashboard-prs-version.py. 
+ +### Summary + +| Script | Status | Action | +|--------|--------|--------| +| patch-prompt-version.py | APPLIED | Delete | +| tmp-patch-research-state.py | APPLIED | Delete | +| patch-dashboard-cost.py | STALE (superseded) | Delete | +| patch-dashboard-prs-cost.py | STALE (superseded) | Delete | +| patch-cost-per-pr.py | NOT APPLIED (abandoned) | Delete | +| patch-dashboard-prs-version.py | NOT APPLIED (low priority) | Delete | +| patch-dashboard-version.py | NOT APPLIED (low priority) | Delete | + +All 7 safe to delete. 2 were applied, 2 were superseded by different implementations, 3 were never applied but the features either exist differently or are low priority. + +--- + +## Root Orphan Files + +### extract.py (693 lines) +- **Location:** Pentagon workspace root +- **Canonical:** teleo-codex/ops/pipeline-v2/openrouter-extract-v2.py (Apr 7+) +- **Status:** Older draft (Apr 1). Confirmed by Cory as safe to delete. +- **Action:** DELETE + +### cascade.py (274 lines) +- **Location:** Pentagon workspace root +- **Canonical:** teleo-codex/ops/pipeline-v2/lib/cascade.py (10372 bytes, Apr 13) +- **Status:** Older draft. Confirmed by Cory as safe to delete. +- **Action:** DELETE + +--- + +## Argus's Patch Scripts (in root diagnostics/) + +8 patch scripts owned by Argus — audit responsibility is Argus's: +- diagnostics/compute_profile_patch.py +- diagnostics/dashboard_compute_patch.py +- diagnostics/patch_4page.py +- diagnostics/patch_dashboard_tokens.py +- diagnostics/patch_evaluate_costs.py +- diagnostics/patch_llm_cli.py +- diagnostics/patch_prs_page.py +- diagnostics/patch_vps_app.py + +These remain in root diagnostics/ until Argus completes his audit. 
diff --git a/ops/diagnostics/alerting.py b/ops/diagnostics/alerting.py index 0c84ae5b4..c0dab371a 100644 --- a/ops/diagnostics/alerting.py +++ b/ops/diagnostics/alerting.py @@ -157,8 +157,17 @@ def check_quality_regression(conn: sqlite3.Connection) -> list[dict]: return alerts +_ALLOWED_DIM_EXPRS = frozenset({ + "json_extract(detail, '$.agent')", + "json_extract(detail, '$.domain')", + "COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent'))", +}) + + def _check_approval_by_dimension(conn, alerts, dim_name, dim_expr): - """Check approval rate regression grouped by a dimension (agent or domain).""" + """Check approval rate regression grouped by a dimension. dim_expr must be in _ALLOWED_DIM_EXPRS.""" + if dim_expr not in _ALLOWED_DIM_EXPRS: + raise ValueError(f"untrusted dim_expr: {dim_expr}") # 7-day baseline per dimension baseline_rows = conn.execute( f"""SELECT {dim_expr} as dim_val, @@ -468,7 +477,7 @@ def generate_failure_report(conn: sqlite3.Connection, agent: str, hours: int = 2 FROM audit_log, json_each(json_extract(detail, '$.issues')) WHERE stage='evaluate' AND event IN ('changes_requested','domain_rejected','tier05_rejected') - AND COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) = ? + AND json_extract(detail, '$.agent') = ? AND timestamp > datetime('now', ? 
|| ' hours') GROUP BY tag ORDER BY cnt DESC LIMIT 5""", diff --git a/ops/diagnostics/alerting_routes.py b/ops/diagnostics/alerting_routes.py index fd3574071..6e736b110 100644 --- a/ops/diagnostics/alerting_routes.py +++ b/ops/diagnostics/alerting_routes.py @@ -26,22 +26,24 @@ async def handle_check(request): conn = request.app["_alerting_conn_func"]() try: alerts = run_all_checks(conn) + + # Generate failure reports for agents with stuck loops + failure_reports = {} + stuck_agents = {a["agent"] for a in alerts if a["category"] == "health" and "stuck" in a["id"] and a["agent"]} + for agent in stuck_agents: + report = generate_failure_report(conn, agent) + if report: + failure_reports[agent] = report except Exception as e: logger.error("Check failed: %s", e) return web.json_response({"error": str(e)}, status=500) + finally: + conn.close() global _active_alerts, _last_check _active_alerts = alerts _last_check = datetime.now(timezone.utc).isoformat() - # Generate failure reports for agents with stuck loops - failure_reports = {} - stuck_agents = {a["agent"] for a in alerts if a["category"] == "health" and "stuck" in a["id"] and a["agent"]} - for agent in stuck_agents: - report = generate_failure_report(conn, agent) - if report: - failure_reports[agent] = report - result = { "checked_at": _last_check, "alert_count": len(alerts), @@ -104,10 +106,15 @@ async def handle_api_failure_report(request): hours: lookback window (default 24) """ agent = request.match_info["agent"] - hours = int(request.query.get("hours", "24")) + try: + hours = min(int(request.query.get("hours", "24")), 168) + except ValueError: + hours = 24 conn = request.app["_alerting_conn_func"]() - - report = generate_failure_report(conn, agent, hours) + try: + report = generate_failure_report(conn, agent, hours) + finally: + conn.close() if not report: return web.json_response({"agent": agent, "status": "no_rejections", "period_hours": hours}) diff --git a/ops/diagnostics/dashboard_epistemic.py 
b/ops/diagnostics/dashboard_epistemic.py index c0e1c093f..cb3dd5ef7 100644 --- a/ops/diagnostics/dashboard_epistemic.py +++ b/ops/diagnostics/dashboard_epistemic.py @@ -74,7 +74,7 @@ def render_epistemic_page(vital_signs: dict, now: datetime) -> str:
Multi-model agreement rate requires the model_evals table.
- Blocked on: model_evals table creation (Theseus 2 Phase 3) + Blocked on: model_evals table creation (Ship Phase 3)
Current eval models: Haiku (triage), GPT-4o (domain), Sonnet/Opus (Leo).
diff --git a/ops/diagnostics/dashboard_prs.py b/ops/diagnostics/dashboard_prs.py index 638ab52a1..e1ca5c08c 100644 --- a/ops/diagnostics/dashboard_prs.py +++ b/ops/diagnostics/dashboard_prs.py @@ -1,8 +1,8 @@ """PR Lifecycle dashboard — single-page view of every PR through the pipeline. -Sortable table: PR#, summary, claims, domain, contributor, outcome, evals, evaluator, cost, date. -Click any row to expand: claim titles, eval chain, timeline, reviews, issues. -Hero cards: total PRs, merge rate, total claims, est. cost. +Sortable table: PR#, summary, claims, domain, outcome, evals, evaluator, cost, date. +Click any row to expand: timeline, claim list, issues summary. +Hero cards: total PRs, merge rate, median eval rounds, total claims, total cost. Data sources: prs table, audit_log (eval rounds), review_records. Owner: Ship @@ -14,7 +14,7 @@ from shared_ui import render_page EXTRA_CSS = """ - .content-wrapper { max-width: 1600px !important; } + .page-content { max-width: 1600px !important; } .filters { display: flex; gap: 12px; flex-wrap: wrap; margin-bottom: 16px; } .filters select, .filters input { background: #161b22; color: #c9d1d9; border: 1px solid #30363d; @@ -22,15 +22,14 @@ EXTRA_CSS = """ .filters select:focus, .filters input:focus { border-color: #58a6ff; outline: none; } .pr-table { width: 100%; border-collapse: collapse; font-size: 13px; table-layout: fixed; } .pr-table th:nth-child(1) { width: 50px; } /* PR# */ - .pr-table th:nth-child(2) { width: 28%; } /* Summary */ + .pr-table th:nth-child(2) { width: 30%; } /* Summary */ .pr-table th:nth-child(3) { width: 50px; } /* Claims */ - .pr-table th:nth-child(4) { width: 11%; } /* Domain */ - .pr-table th:nth-child(5) { width: 10%; } /* Contributor */ - .pr-table th:nth-child(6) { width: 10%; } /* Outcome */ - .pr-table th:nth-child(7) { width: 44px; } /* Evals */ - .pr-table th:nth-child(8) { width: 12%; } /* Evaluator */ - .pr-table th:nth-child(9) { width: 60px; } /* Cost */ - .pr-table 
th:nth-child(10) { width: 80px; } /* Date */ + .pr-table th:nth-child(4) { width: 12%; } /* Domain */ + .pr-table th:nth-child(5) { width: 10%; } /* Outcome */ + .pr-table th:nth-child(6) { width: 50px; } /* Evals */ + .pr-table th:nth-child(7) { width: 16%; } /* Evaluator */ + .pr-table th:nth-child(8) { width: 70px; } /* Cost */ + .pr-table th:nth-child(9) { width: 90px; } /* Date */ .pr-table td { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; padding: 8px 6px; } .pr-table td:nth-child(2) { white-space: normal; overflow: visible; line-height: 1.4; } .pr-table th { cursor: pointer; user-select: none; position: relative; padding: 8px 18px 8px 6px; } @@ -49,24 +48,22 @@ EXTRA_CSS = """ .pr-table .pr-link:hover { text-decoration: underline; } .pr-table td .summary-text { font-size: 12px; color: #c9d1d9; } .pr-table td .review-snippet { font-size: 11px; color: #f85149; margin-top: 2px; opacity: 0.8; } - .pr-table td .model-tag { font-size: 10px; color: #6e7681; background: #161b22; border-radius: 3px; padding: 1px 4px; } - .pr-table td .contributor-tag { font-size: 11px; color: #d2a8ff; } - .pr-table td .contributor-self { font-size: 11px; color: #6e7681; font-style: italic; } + .pr-table td .model-tag { font-size: 9px; color: #6e7681; background: #21262d; border-radius: 3px; padding: 1px 4px; display: inline-block; margin: 1px 0; } .pr-table td .expand-chevron { display: inline-block; width: 12px; color: #484f58; font-size: 10px; transition: transform 0.2s; } .pr-table tr.expanded .expand-chevron { transform: rotate(90deg); color: #58a6ff; } + .pr-table td .cost-val { font-size: 12px; color: #8b949e; } + .pr-table td .claims-count { font-size: 13px; color: #c9d1d9; text-align: center; } + .pr-table td .evals-count { font-size: 13px; text-align: center; } .trace-panel { background: #0d1117; border: 1px solid #30363d; border-radius: 8px; padding: 16px; margin: 4px 0 8px 0; font-size: 12px; display: none; } .trace-panel.open { display: block; } - 
.trace-panel h4 { color: #58a6ff; font-size: 12px; margin: 12px 0 6px 0; } - .trace-panel h4:first-child { margin-top: 0; } - .claim-list { list-style: none; padding: 0; margin: 0; } - .claim-list li { padding: 4px 0 4px 16px; border-left: 2px solid #238636; color: #c9d1d9; font-size: 12px; line-height: 1.5; } - .claim-list li .claim-confidence { font-size: 10px; color: #8b949e; margin-left: 6px; } - .issues-box { background: #1c1210; border: 1px solid #f8514933; border-radius: 6px; + .trace-panel .section-title { color: #58a6ff; font-size: 12px; font-weight: 600; margin: 12px 0 6px; } + .trace-panel .section-title:first-child { margin-top: 0; } + .trace-panel .claim-list { list-style: none; padding: 0; margin: 0; } + .trace-panel .claim-list li { padding: 4px 0; border-bottom: 1px solid #21262d; color: #c9d1d9; font-size: 12px; } + .trace-panel .claim-list li:last-child { border-bottom: none; } + .trace-panel .issues-box { background: #1c1017; border: 1px solid #f8514930; border-radius: 6px; padding: 8px 12px; margin: 4px 0; font-size: 12px; color: #f85149; } - .eval-chain { background: #161b22; border-radius: 6px; padding: 8px 12px; margin: 4px 0; font-size: 12px; } - .eval-chain .chain-step { display: inline-block; margin-right: 6px; } - .eval-chain .chain-arrow { color: #484f58; margin: 0 4px; } .trace-timeline { list-style: none; padding: 0; } .trace-timeline li { padding: 4px 0; border-left: 2px solid #30363d; padding-left: 12px; margin-left: 8px; } .trace-timeline li .ts { color: #484f58; font-size: 11px; } @@ -76,6 +73,12 @@ EXTRA_CSS = """ .trace-timeline li.ev-changes .ev { color: #d29922; } .review-text { background: #161b22; padding: 8px 12px; border-radius: 4px; margin: 4px 0; white-space: pre-wrap; font-size: 11px; color: #8b949e; max-height: 200px; overflow-y: auto; } + .eval-chain { background: #161b22; border-radius: 6px; padding: 8px 12px; margin: 4px 0 8px; + font-size: 12px; display: flex; gap: 12px; flex-wrap: wrap; align-items: center; } + 
.eval-chain .step { display: flex; align-items: center; gap: 4px; } + .eval-chain .step-label { color: #8b949e; font-size: 11px; } + .eval-chain .step-model { color: #c9d1d9; font-size: 11px; font-weight: 600; } + .eval-chain .arrow { color: #484f58; } .pagination { display: flex; gap: 8px; align-items: center; justify-content: center; margin-top: 16px; } .pagination button { background: #161b22; color: #c9d1d9; border: 1px solid #30363d; border-radius: 4px; padding: 4px 12px; cursor: pointer; font-size: 12px; } @@ -93,6 +96,7 @@ def render_prs_page(now: datetime) -> str:
Total PRs
--
Merge Rate
--
+
Median Eval Rounds
--
Total Claims
--
Est. Cost
--
@@ -100,7 +104,6 @@ def render_prs_page(now: datetime) -> str:
-