diff --git a/diagnostics/alerting.py b/diagnostics/alerting.py
index 0c84ae5..3de3819 100644
--- a/diagnostics/alerting.py
+++ b/diagnostics/alerting.py
@@ -67,6 +67,8 @@ def check_agent_health(conn: sqlite3.Connection) -> list[dict]:
now = datetime.now(timezone.utc)
for r in rows:
agent = r["agent"]
+ if agent in ("unknown", None):
+ continue
latest = r["latest"]
if not latest:
continue
@@ -157,8 +159,17 @@ def check_quality_regression(conn: sqlite3.Connection) -> list[dict]:
return alerts
+_ALLOWED_DIM_EXPRS = frozenset({
+ "json_extract(detail, '$.agent')",
+ "json_extract(detail, '$.domain')",
+ "COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent'))",
+})
+
+
def _check_approval_by_dimension(conn, alerts, dim_name, dim_expr):
- """Check approval rate regression grouped by a dimension (agent or domain)."""
+ """Check approval rate regression grouped by a dimension. dim_expr is interpolated into the SQL text, so it must be one of _ALLOWED_DIM_EXPRS; any other value raises ValueError."""
+ if dim_expr not in _ALLOWED_DIM_EXPRS:
+ raise ValueError(f"untrusted dim_expr: {dim_expr}")
# 7-day baseline per dimension
baseline_rows = conn.execute(
f"""SELECT {dim_expr} as dim_val,
@@ -257,24 +268,22 @@ def check_rejection_spike(conn: sqlite3.Connection) -> list[dict]:
"""Detect single rejection reason exceeding REJECTION_SPIKE_RATIO of recent rejections."""
alerts = []
- # Total rejections in 24h
+ # PRs created in the last 24h whose eval_issues is neither NULL nor '[]' (window is keyed on created_at; prs.eval_issues is the canonical source — Epimetheus 2026-04-02)
total = conn.execute(
- """SELECT COUNT(*) as n FROM audit_log
- WHERE stage='evaluate'
- AND event IN ('changes_requested','domain_rejected','tier05_rejected')
- AND timestamp > datetime('now', '-24 hours')"""
+ """SELECT COUNT(*) as n FROM prs
+ WHERE eval_issues IS NOT NULL AND eval_issues != '[]'
+ AND created_at > datetime('now', '-24 hours')"""
).fetchone()["n"]
if total < 10:
return alerts # Not enough data
- # Count by rejection tag
+ # Count by rejection tag from prs.eval_issues
tags = conn.execute(
"""SELECT value as tag, COUNT(*) as cnt
- FROM audit_log, json_each(json_extract(detail, '$.issues'))
- WHERE stage='evaluate'
- AND event IN ('changes_requested','domain_rejected','tier05_rejected')
- AND timestamp > datetime('now', '-24 hours')
+ FROM prs, json_each(prs.eval_issues)
+ WHERE eval_issues IS NOT NULL AND eval_issues != '[]'
+ AND created_at > datetime('now', '-24 hours')
GROUP BY tag ORDER BY cnt DESC"""
).fetchall()
@@ -306,16 +315,13 @@ def check_stuck_loops(conn: sqlite3.Connection) -> list[dict]:
"""Detect agents repeatedly failing on the same rejection reason."""
alerts = []
- # COALESCE: rejection events use $.agent, eval events use $.domain_agent (Epimetheus 2026-03-28)
+ # Agent + rejection reason from prs table directly (Epimetheus correction 2026-04-02)
rows = conn.execute(
- """SELECT COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) as agent,
- value as tag,
- COUNT(*) as cnt
- FROM audit_log, json_each(json_extract(detail, '$.issues'))
- WHERE stage='evaluate'
- AND event IN ('changes_requested','domain_rejected','tier05_rejected')
- AND timestamp > datetime('now', '-6 hours')
- AND COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) IS NOT NULL
+ """SELECT agent, value as tag, COUNT(*) as cnt
+ FROM prs, json_each(prs.eval_issues)
+ WHERE eval_issues IS NOT NULL AND eval_issues != '[]'
+ AND agent IS NOT NULL
+ AND created_at > datetime('now', '-6 hours')
GROUP BY agent, tag
HAVING cnt > ?""",
(STUCK_LOOP_THRESHOLD,),
@@ -403,16 +409,13 @@ def check_domain_rejection_patterns(conn: sqlite3.Connection) -> list[dict]:
"""Track rejection reason shift per domain — surfaces domain maturity issues."""
alerts = []
- # Per-domain rejection breakdown in 24h
+ # Per-domain rejection breakdown in 24h from prs table (Epimetheus correction 2026-04-02)
rows = conn.execute(
- """SELECT json_extract(detail, '$.domain') as domain,
- value as tag,
- COUNT(*) as cnt
- FROM audit_log, json_each(json_extract(detail, '$.issues'))
- WHERE stage='evaluate'
- AND event IN ('changes_requested','domain_rejected','tier05_rejected')
- AND timestamp > datetime('now', '-24 hours')
- AND json_extract(detail, '$.domain') IS NOT NULL
+ """SELECT domain, value as tag, COUNT(*) as cnt
+ FROM prs, json_each(prs.eval_issues)
+ WHERE eval_issues IS NOT NULL AND eval_issues != '[]'
+ AND domain IS NOT NULL
+ AND created_at > datetime('now', '-24 hours')
GROUP BY domain, tag
ORDER BY domain, cnt DESC"""
).fetchall()
@@ -464,12 +467,11 @@ def generate_failure_report(conn: sqlite3.Connection, agent: str, hours: int = 2
hours = int(hours) # defensive — callers should pass int, but enforce it
rows = conn.execute(
"""SELECT value as tag, COUNT(*) as cnt,
- GROUP_CONCAT(DISTINCT json_extract(detail, '$.pr')) as pr_numbers
- FROM audit_log, json_each(json_extract(detail, '$.issues'))
- WHERE stage='evaluate'
- AND event IN ('changes_requested','domain_rejected','tier05_rejected')
- AND COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) = ?
- AND timestamp > datetime('now', ? || ' hours')
+ GROUP_CONCAT(DISTINCT number) as pr_numbers
+ FROM prs, json_each(prs.eval_issues)
+ WHERE eval_issues IS NOT NULL AND eval_issues != '[]'
+ AND agent = ?
+ AND created_at > datetime('now', ? || ' hours')
GROUP BY tag ORDER BY cnt DESC
LIMIT 5""",
(agent, f"-{hours}"),
diff --git a/diagnostics/alerting_routes.py b/diagnostics/alerting_routes.py
index fd35740..6e736b1 100644
--- a/diagnostics/alerting_routes.py
+++ b/diagnostics/alerting_routes.py
@@ -26,22 +26,24 @@ async def handle_check(request):
conn = request.app["_alerting_conn_func"]()
try:
alerts = run_all_checks(conn)
+
+ # Generate failure reports for agents with stuck loops
+ failure_reports = {}
+ stuck_agents = {a["agent"] for a in alerts if a["category"] == "health" and "stuck" in a["id"] and a["agent"]}
+ for agent in stuck_agents:
+ report = generate_failure_report(conn, agent)
+ if report:
+ failure_reports[agent] = report
except Exception as e:
logger.error("Check failed: %s", e)
return web.json_response({"error": str(e)}, status=500)
+ finally:
+ conn.close()
global _active_alerts, _last_check
_active_alerts = alerts
_last_check = datetime.now(timezone.utc).isoformat()
- # Generate failure reports for agents with stuck loops
- failure_reports = {}
- stuck_agents = {a["agent"] for a in alerts if a["category"] == "health" and "stuck" in a["id"] and a["agent"]}
- for agent in stuck_agents:
- report = generate_failure_report(conn, agent)
- if report:
- failure_reports[agent] = report
-
result = {
"checked_at": _last_check,
"alert_count": len(alerts),
@@ -104,10 +106,15 @@ async def handle_api_failure_report(request):
hours: lookback window (default 24)
"""
agent = request.match_info["agent"]
- hours = int(request.query.get("hours", "24"))
+ try:
+ hours = min(int(request.query.get("hours", "24")), 168)
+ except ValueError:
+ hours = 24
conn = request.app["_alerting_conn_func"]()
-
- report = generate_failure_report(conn, agent, hours)
+ try:
+ report = generate_failure_report(conn, agent, hours)
+ finally:
+ conn.close()
if not report:
return web.json_response({"agent": agent, "status": "no_rejections", "period_hours": hours})
diff --git a/diagnostics/dashboard_epistemic.py b/diagnostics/dashboard_epistemic.py
index c0e1c09..6074f42 100644
--- a/diagnostics/dashboard_epistemic.py
+++ b/diagnostics/dashboard_epistemic.py
@@ -74,7 +74,7 @@ def render_epistemic_page(vital_signs: dict, now: datetime) -> str:
⚙
Multi-model agreement rate requires the model_evals table.
- Blocked on: model_evals table creation (Theseus 2 Phase 3)
+ Blocked on: model_evals table creation (Ship Phase 3)