diff --git a/diagnostics/alerting.py b/diagnostics/alerting.py index 0c84ae5..3de3819 100644 --- a/diagnostics/alerting.py +++ b/diagnostics/alerting.py @@ -67,6 +67,8 @@ def check_agent_health(conn: sqlite3.Connection) -> list[dict]: now = datetime.now(timezone.utc) for r in rows: agent = r["agent"] + if agent in ("unknown", None): + continue latest = r["latest"] if not latest: continue @@ -157,8 +159,17 @@ def check_quality_regression(conn: sqlite3.Connection) -> list[dict]: return alerts +_ALLOWED_DIM_EXPRS = frozenset({ + "json_extract(detail, '$.agent')", + "json_extract(detail, '$.domain')", + "COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent'))", +}) + + def _check_approval_by_dimension(conn, alerts, dim_name, dim_expr): - """Check approval rate regression grouped by a dimension (agent or domain).""" + """Check approval rate regression grouped by a dimension. dim_expr must be in _ALLOWED_DIM_EXPRS.""" + if dim_expr not in _ALLOWED_DIM_EXPRS: + raise ValueError(f"untrusted dim_expr: {dim_expr}") # 7-day baseline per dimension baseline_rows = conn.execute( f"""SELECT {dim_expr} as dim_val, @@ -257,24 +268,22 @@ def check_rejection_spike(conn: sqlite3.Connection) -> list[dict]: """Detect single rejection reason exceeding REJECTION_SPIKE_RATIO of recent rejections.""" alerts = [] - # Total rejections in 24h + # Total rejected PRs in 24h (prs.eval_issues is the canonical source — Epimetheus 2026-04-02) total = conn.execute( - """SELECT COUNT(*) as n FROM audit_log - WHERE stage='evaluate' - AND event IN ('changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-24 hours')""" + """SELECT COUNT(*) as n FROM prs + WHERE eval_issues IS NOT NULL AND eval_issues != '[]' + AND created_at > datetime('now', '-24 hours')""" ).fetchone()["n"] if total < 10: return alerts # Not enough data - # Count by rejection tag + # Count by rejection tag from prs.eval_issues tags = conn.execute( """SELECT value as tag, COUNT(*) as cnt - FROM audit_log, json_each(json_extract(detail, '$.issues')) - WHERE stage='evaluate' - AND event IN ('changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-24 hours') + FROM prs, json_each(prs.eval_issues) + WHERE eval_issues IS NOT NULL AND eval_issues != '[]' + AND created_at > datetime('now', '-24 hours') GROUP BY tag ORDER BY cnt DESC""" ).fetchall() @@ -306,16 +315,13 @@ def check_stuck_loops(conn: sqlite3.Connection) -> list[dict]: """Detect agents repeatedly failing on the same rejection reason.""" alerts = [] - # COALESCE: rejection events use $.agent, eval events use $.domain_agent (Epimetheus 2026-03-28) + # Agent + rejection reason from prs table directly (Epimetheus correction 2026-04-02) rows = conn.execute( - """SELECT COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) as agent, - value as tag, - COUNT(*) as cnt - FROM audit_log, json_each(json_extract(detail, '$.issues')) - WHERE stage='evaluate' - AND event IN ('changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-6 hours') - AND COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) IS NOT NULL + """SELECT agent, value as tag, COUNT(*) as cnt + FROM prs, json_each(prs.eval_issues) + WHERE eval_issues IS NOT NULL AND eval_issues != '[]' + AND agent IS NOT NULL + AND created_at > datetime('now', '-6 hours') GROUP BY agent, tag HAVING cnt > ?""", (STUCK_LOOP_THRESHOLD,), @@ -403,16 +409,13 @@ def check_domain_rejection_patterns(conn: sqlite3.Connection) -> list[dict]: """Track rejection reason shift per domain — surfaces domain maturity issues.""" alerts = [] - # Per-domain rejection breakdown in 24h + # Per-domain rejection breakdown in 24h from prs table (Epimetheus correction 2026-04-02) rows = conn.execute( - """SELECT json_extract(detail, '$.domain') as domain, - value as tag, - COUNT(*) as cnt - FROM audit_log, json_each(json_extract(detail, '$.issues')) - WHERE stage='evaluate' - AND event IN ('changes_requested','domain_rejected','tier05_rejected') - AND timestamp > datetime('now', '-24 hours') - AND json_extract(detail, '$.domain') IS NOT NULL + """SELECT domain, value as tag, COUNT(*) as cnt + FROM prs, json_each(prs.eval_issues) + WHERE eval_issues IS NOT NULL AND eval_issues != '[]' + AND domain IS NOT NULL + AND created_at > datetime('now', '-24 hours') GROUP BY domain, tag ORDER BY domain, cnt DESC""" ).fetchall() @@ -464,12 +467,11 @@ def generate_failure_report(conn: sqlite3.Connection, agent: str, hours: int = 2 hours = int(hours) # defensive — callers should pass int, but enforce it rows = conn.execute( """SELECT value as tag, COUNT(*) as cnt, - GROUP_CONCAT(DISTINCT json_extract(detail, '$.pr')) as pr_numbers - FROM audit_log, json_each(json_extract(detail, '$.issues')) - WHERE stage='evaluate' - AND event IN ('changes_requested','domain_rejected','tier05_rejected') - AND COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) = ? - AND timestamp > datetime('now', ? || ' hours') + GROUP_CONCAT(DISTINCT number) as pr_numbers + FROM prs, json_each(prs.eval_issues) + WHERE eval_issues IS NOT NULL AND eval_issues != '[]' + AND agent = ? + AND created_at > datetime('now', ? || ' hours') GROUP BY tag ORDER BY cnt DESC LIMIT 5""", (agent, f"-{hours}"), diff --git a/diagnostics/alerting_routes.py b/diagnostics/alerting_routes.py index fd35740..6e736b1 100644 --- a/diagnostics/alerting_routes.py +++ b/diagnostics/alerting_routes.py @@ -26,22 +26,24 @@ async def handle_check(request): conn = request.app["_alerting_conn_func"]() try: alerts = run_all_checks(conn) + + # Generate failure reports for agents with stuck loops + failure_reports = {} + stuck_agents = {a["agent"] for a in alerts if a["category"] == "health" and "stuck" in a["id"] and a["agent"]} + for agent in stuck_agents: + report = generate_failure_report(conn, agent) + if report: + failure_reports[agent] = report except Exception as e: logger.error("Check failed: %s", e) return web.json_response({"error": str(e)}, status=500) + finally: + conn.close() global _active_alerts, _last_check _active_alerts = alerts _last_check = datetime.now(timezone.utc).isoformat() - # Generate failure reports for agents with stuck loops - failure_reports = {} - stuck_agents = {a["agent"] for a in alerts if a["category"] == "health" and "stuck" in a["id"] and a["agent"]} - for agent in stuck_agents: - report = generate_failure_report(conn, agent) - if report: - failure_reports[agent] = report - result = { "checked_at": _last_check, "alert_count": len(alerts), @@ -104,10 +106,15 @@ async def handle_api_failure_report(request): hours: lookback window (default 24) """ agent = request.match_info["agent"] - hours = int(request.query.get("hours", "24")) + try: + hours = min(int(request.query.get("hours", "24")), 168) + except ValueError: + hours = 24 conn = request.app["_alerting_conn_func"]() - - report = generate_failure_report(conn, agent, hours) + try: + report = generate_failure_report(conn, agent, hours) + finally: + conn.close() if not report: return web.json_response({"agent": agent, "status": "no_rejections", "period_hours": hours}) diff --git a/diagnostics/dashboard_epistemic.py b/diagnostics/dashboard_epistemic.py index c0e1c09..6074f42 100644 --- a/diagnostics/dashboard_epistemic.py +++ b/diagnostics/dashboard_epistemic.py @@ -74,7 +74,7 @@ def render_epistemic_page(vital_signs: dict, now: datetime) -> str:
Multi-model agreement rate requires the model_evals table.
- Blocked on: model_evals table creation (Theseus 2 Phase 3) + Blocked on: model_evals table creation (Ship Phase 3)
Current eval models: Haiku (triage), GPT-4o (domain), Sonnet/Opus (Leo).
@@ -194,12 +194,6 @@ fetch('/api/review-summary?days=30') reasonRows += '' + esc(r.reason) + '' + r.count + ''; }} - // Disagreement types - let disagreeRows = ''; - for (const d of (data.disagreement_types || [])) {{ - disagreeRows += '' + esc(d.type) + '' + d.count + ''; - }} - el.innerHTML = `
Total Reviews
${{data.total}}
@@ -215,13 +209,6 @@ fetch('/api/review-summary?days=30') ${{reasonRows || 'No rejections'}}
-
-
Disagreement Types
- - - ${{disagreeRows || ''}} -
TypeCount
No disagreements
-
`; }}).catch(() => {{ document.getElementById('review-container').innerHTML = diff --git a/diagnostics/dashboard_prs.py b/diagnostics/dashboard_prs.py index 638ab52..e1ca5c0 100644 --- a/diagnostics/dashboard_prs.py +++ b/diagnostics/dashboard_prs.py @@ -1,8 +1,8 @@ """PR Lifecycle dashboard — single-page view of every PR through the pipeline. -Sortable table: PR#, summary, claims, domain, contributor, outcome, evals, evaluator, cost, date. -Click any row to expand: claim titles, eval chain, timeline, reviews, issues. -Hero cards: total PRs, merge rate, total claims, est. cost. +Sortable table: PR#, summary, claims, domain, outcome, evals, evaluator, cost, date. +Click any row to expand: timeline, claim list, issues summary. +Hero cards: total PRs, merge rate, median eval rounds, total claims, total cost. Data sources: prs table, audit_log (eval rounds), review_records. Owner: Ship @@ -14,7 +14,7 @@ from shared_ui import render_page EXTRA_CSS = """ - .content-wrapper { max-width: 1600px !important; } + .page-content { max-width: 1600px !important; } .filters { display: flex; gap: 12px; flex-wrap: wrap; margin-bottom: 16px; } .filters select, .filters input { background: #161b22; color: #c9d1d9; border: 1px solid #30363d; @@ -22,15 +22,14 @@ EXTRA_CSS = """ .filters select:focus, .filters input:focus { border-color: #58a6ff; outline: none; } .pr-table { width: 100%; border-collapse: collapse; font-size: 13px; table-layout: fixed; } .pr-table th:nth-child(1) { width: 50px; } /* PR# */ - .pr-table th:nth-child(2) { width: 28%; } /* Summary */ + .pr-table th:nth-child(2) { width: 30%; } /* Summary */ .pr-table th:nth-child(3) { width: 50px; } /* Claims */ - .pr-table th:nth-child(4) { width: 11%; } /* Domain */ - .pr-table th:nth-child(5) { width: 10%; } /* Contributor */ - .pr-table th:nth-child(6) { width: 10%; } /* Outcome */ - .pr-table th:nth-child(7) { width: 44px; } /* Evals */ - .pr-table th:nth-child(8) { width: 12%; } /* Evaluator */ - .pr-table th:nth-child(9) { width: 60px; } /* Cost */ - .pr-table th:nth-child(10) { width: 80px; } /* Date */ + .pr-table th:nth-child(4) { width: 12%; } /* Domain */ + .pr-table th:nth-child(5) { width: 10%; } /* Outcome */ + .pr-table th:nth-child(6) { width: 50px; } /* Evals */ + .pr-table th:nth-child(7) { width: 16%; } /* Evaluator */ + .pr-table th:nth-child(8) { width: 70px; } /* Cost */ + .pr-table th:nth-child(9) { width: 90px; } /* Date */ .pr-table td { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; padding: 8px 6px; } .pr-table td:nth-child(2) { white-space: normal; overflow: visible; line-height: 1.4; } .pr-table th { cursor: pointer; user-select: none; position: relative; padding: 8px 18px 8px 6px; } @@ -49,24 +48,22 @@ EXTRA_CSS = """ .pr-table .pr-link:hover { text-decoration: underline; } .pr-table td .summary-text { font-size: 12px; color: #c9d1d9; } .pr-table td .review-snippet { font-size: 11px; color: #f85149; margin-top: 2px; opacity: 0.8; } - .pr-table td .model-tag { font-size: 10px; color: #6e7681; background: #161b22; border-radius: 3px; padding: 1px 4px; } - .pr-table td .contributor-tag { font-size: 11px; color: #d2a8ff; } - .pr-table td .contributor-self { font-size: 11px; color: #6e7681; font-style: italic; } + .pr-table td .model-tag { font-size: 9px; color: #6e7681; background: #21262d; border-radius: 3px; padding: 1px 4px; display: inline-block; margin: 1px 0; } .pr-table td .expand-chevron { display: inline-block; width: 12px; color: #484f58; font-size: 10px; transition: transform 0.2s; } .pr-table tr.expanded .expand-chevron { transform: rotate(90deg); color: #58a6ff; } + .pr-table td .cost-val { font-size: 12px; color: #8b949e; } + .pr-table td .claims-count { font-size: 13px; color: #c9d1d9; text-align: center; } + .pr-table td .evals-count { font-size: 13px; text-align: center; } .trace-panel { background: #0d1117; border: 1px solid #30363d; border-radius: 8px; padding: 16px; margin: 4px 0 8px 0; font-size: 12px; display: none; } .trace-panel.open { display: block; } - .trace-panel h4 { color: #58a6ff; font-size: 12px; margin: 12px 0 6px 0; } - .trace-panel h4:first-child { margin-top: 0; } - .claim-list { list-style: none; padding: 0; margin: 0; } - .claim-list li { padding: 4px 0 4px 16px; border-left: 2px solid #238636; color: #c9d1d9; font-size: 12px; line-height: 1.5; } - .claim-list li .claim-confidence { font-size: 10px; color: #8b949e; margin-left: 6px; } - .issues-box { background: #1c1210; border: 1px solid #f8514933; border-radius: 6px; + .trace-panel .section-title { color: #58a6ff; font-size: 12px; font-weight: 600; margin: 12px 0 6px; } + .trace-panel .section-title:first-child { margin-top: 0; } + .trace-panel .claim-list { list-style: none; padding: 0; margin: 0; } + .trace-panel .claim-list li { padding: 4px 0; border-bottom: 1px solid #21262d; color: #c9d1d9; font-size: 12px; } + .trace-panel .claim-list li:last-child { border-bottom: none; } + .trace-panel .issues-box { background: #1c1017; border: 1px solid #f8514930; border-radius: 6px; padding: 8px 12px; margin: 4px 0; font-size: 12px; color: #f85149; } - .eval-chain { background: #161b22; border-radius: 6px; padding: 8px 12px; margin: 4px 0; font-size: 12px; } - .eval-chain .chain-step { display: inline-block; margin-right: 6px; } - .eval-chain .chain-arrow { color: #484f58; margin: 0 4px; } .trace-timeline { list-style: none; padding: 0; } .trace-timeline li { padding: 4px 0; border-left: 2px solid #30363d; padding-left: 12px; margin-left: 8px; } .trace-timeline li .ts { color: #484f58; font-size: 11px; } @@ -76,6 +73,12 @@ EXTRA_CSS = """ .trace-timeline li.ev-changes .ev { color: #d29922; } .review-text { background: #161b22; padding: 8px 12px; border-radius: 4px; margin: 4px 0; white-space: pre-wrap; font-size: 11px; color: #8b949e; max-height: 200px; overflow-y: auto; } + .eval-chain { background: #161b22; border-radius: 6px; padding: 8px 12px; margin: 4px 0 8px; + font-size: 12px; display: flex; gap: 12px; flex-wrap: wrap; align-items: center; } + .eval-chain .step { display: flex; align-items: center; gap: 4px; } + .eval-chain .step-label { color: #8b949e; font-size: 11px; } + .eval-chain .step-model { color: #c9d1d9; font-size: 11px; font-weight: 600; } + .eval-chain .arrow { color: #484f58; } .pagination { display: flex; gap: 8px; align-items: center; justify-content: center; margin-top: 16px; } .pagination button { background: #161b22; color: #c9d1d9; border: 1px solid #30363d; border-radius: 4px; padding: 4px 12px; cursor: pointer; font-size: 12px; } @@ -93,6 +96,7 @@ def render_prs_page(now: datetime) -> str:
Total PRs
--
Merge Rate
--
+
Median Eval Rounds
--
Total Claims
--
Est. Cost
--
@@ -100,7 +104,6 @@ def render_prs_page(now: datetime) -> str:
-