Merge branch 'epimetheus/consolidate-infra'

This commit is contained in:
m3taversal 2026-04-13 10:59:36 +02:00
commit 7ba6247b9d
21 changed files with 3120 additions and 1498 deletions

View file

@ -1,537 +0,0 @@
"""Argus active monitoring — health watchdog, quality regression, throughput anomaly detection.
Provides check functions that detect problems and return structured alerts.
Called by /check endpoint (periodic cron) or on-demand.
Alert schema:
{
"id": str, # unique key for dedup (e.g. "dormant:ganymede")
"severity": str, # "critical" | "warning" | "info"
"category": str, # "health" | "quality" | "throughput" | "failure_pattern"
"title": str, # human-readable headline
"detail": str, # actionable description
"agent": str|None, # affected agent (if applicable)
"domain": str|None, # affected domain (if applicable)
"detected_at": str, # ISO timestamp
"auto_resolve": bool, # clears when condition clears
}
"""
import json
import sqlite3
import statistics
from datetime import datetime, timezone
# ─── Agent-domain mapping (static config, maintained by Argus) ──────────────
# Maps an agent name to the list of domains it works in; None marks an agent
# that is not tied to a single domain (see the per-entry notes).
# NOTE(review): nothing in the visible part of this module reads this mapping —
# confirm it is still consumed elsewhere before relying on it.
AGENT_DOMAINS = {
"rio": ["internet-finance"],
"clay": ["creative-industries"],
"ganymede": None, # reviewer — cross-domain
"epimetheus": None, # infra
"leo": None, # standards
"oberon": None, # evolution tracking
"vida": None, # health monitoring
"hermes": None, # comms
"astra": None, # research
}
# Thresholds
# Each constant is read by exactly one check function below; changing a value
# changes when that check starts emitting alerts.
DORMANCY_HOURS = 48
APPROVAL_DROP_THRESHOLD = 15 # percentage points below 7-day baseline
THROUGHPUT_DROP_RATIO = 0.5 # alert if today < 50% of 7-day SMA
REJECTION_SPIKE_RATIO = 0.20 # single reason > 20% of recent rejections
STUCK_LOOP_THRESHOLD = 3 # same agent + same rejection reason > N times in 6h
COST_SPIKE_RATIO = 2.0 # daily cost > 2x 7-day average
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
# ─── Check: Agent Health (dormancy detection) ───────────────────────────────
def check_agent_health(conn: sqlite3.Connection) -> list[dict]:
    """Detect agents with no PR activity in the last DORMANCY_HOURS hours.

    Reads the most recent ``last_attempt`` per agent from the ``prs`` table and
    emits one "dormant:<agent>" warning for every agent whose latest activity
    is older than the threshold.

    Args:
        conn: SQLite connection. Rows are read by column name, so the
            connection needs a name-addressable row factory such as
            ``sqlite3.Row``.

    Returns:
        A list of alert dicts (possibly empty) following the module's alert
        schema.
    """
    alerts = []
    # Get last activity per agent
    rows = conn.execute(
        """SELECT agent, MAX(last_attempt) as latest, COUNT(*) as total_prs
           FROM prs WHERE agent IS NOT NULL
           GROUP BY agent"""
    ).fetchall()
    now = datetime.now(timezone.utc)
    for r in rows:
        agent = r["agent"]
        latest = r["latest"]
        if not latest:
            continue
        stamp = str(latest)
        if stamp.endswith("Z"):
            # datetime.fromisoformat() only understands a trailing "Z" from
            # Python 3.11 onwards; normalise it to an explicit UTC offset.
            stamp = stamp[:-1] + "+00:00"
        try:
            last_dt = datetime.fromisoformat(stamp)
        except ValueError:
            # Dormancy cannot be judged from an unparsable timestamp. Skip the
            # agent instead of letting one bad row abort the whole check run.
            continue
        if last_dt.tzinfo is None:
            # Naive timestamps are interpreted as UTC.
            last_dt = last_dt.replace(tzinfo=timezone.utc)
        hours_since = (now - last_dt).total_seconds() / 3600
        if hours_since > DORMANCY_HOURS:
            alerts.append({
                "id": f"dormant:{agent}",
                "severity": "warning",
                "category": "health",
                "title": f"Agent '{agent}' dormant for {int(hours_since)}h",
                "detail": (
                    f"No PR activity since {latest}. "
                    f"Last seen {int(hours_since)}h ago (threshold: {DORMANCY_HOURS}h). "
                    f"Total historical PRs: {r['total_prs']}."
                ),
                "agent": agent,
                "domain": None,
                "detected_at": _now_iso(),
                "auto_resolve": True,
            })
    return alerts
# ─── Check: Quality Regression (approval rate drop) ─────────────────────────
def check_quality_regression(conn: sqlite3.Connection) -> list[dict]:
    """Flag approval-rate regressions: last 24 hours versus the last 7 days.

    Computes the overall approval percentage over both windows from
    ``audit_log`` evaluate-stage events and raises a critical alert when the
    24h rate sits more than APPROVAL_DROP_THRESHOLD percentage points below
    the 7-day rate. It then delegates to ``_check_approval_by_dimension`` for
    the per-agent and per-domain breakdowns, which append to the same list.

    Args:
        conn: SQLite connection whose rows can be read by column name.

    Returns:
        A list of alert dicts (possibly empty).
    """

    def _approval_pct(counts):
        # None means "no evaluations in this window" and disables the comparison.
        if not counts["total"]:
            return None
        return counts["approved"] / counts["total"] * 100

    # 7-day baseline (overall)
    week_counts = conn.execute(
        """SELECT
           COUNT(CASE WHEN event='approved' THEN 1 END) as approved,
           COUNT(*) as total
           FROM audit_log
           WHERE stage='evaluate'
           AND event IN ('approved','changes_requested','domain_rejected','tier05_rejected')
           AND timestamp > datetime('now', '-7 days')"""
    ).fetchone()
    # Last 24 hours (overall)
    day_counts = conn.execute(
        """SELECT
           COUNT(CASE WHEN event='approved' THEN 1 END) as approved,
           COUNT(*) as total
           FROM audit_log
           WHERE stage='evaluate'
           AND event IN ('approved','changes_requested','domain_rejected','tier05_rejected')
           AND timestamp > datetime('now', '-24 hours')"""
    ).fetchone()
    week_pct = _approval_pct(week_counts)
    day_pct = _approval_pct(day_counts)

    found = []
    comparable = week_pct is not None and day_pct is not None
    if comparable and week_pct - day_pct > APPROVAL_DROP_THRESHOLD:
        gap = week_pct - day_pct
        found.append({
            "id": "quality_regression:overall",
            "severity": "critical",
            "category": "quality",
            "title": f"Approval rate dropped {gap:.0f}pp (24h: {day_pct:.0f}% vs 7d: {week_pct:.0f}%)",
            "detail": (
                f"24h approval rate ({day_pct:.1f}%) is {gap:.1f} percentage points below "
                f"7-day baseline ({week_pct:.1f}%). "
                f"Evaluated {day_counts['total']} PRs in last 24h."
            ),
            "agent": None,
            "domain": None,
            "detected_at": _now_iso(),
            "auto_resolve": True,
        })

    # Per-agent breakdown. COALESCE: rejection events use $.agent, eval events
    # use $.domain_agent (Epimetheus 2026-03-28).
    agent_expr = "COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent'))"
    _check_approval_by_dimension(conn, found, "agent", agent_expr)
    # Per-domain breakdown — Theseus addition.
    domain_expr = "json_extract(detail, '$.domain')"
    _check_approval_by_dimension(conn, found, "domain", domain_expr)
    return found
def _check_approval_by_dimension(conn, alerts, dim_name, dim_expr):
"""Check approval rate regression grouped by a dimension (agent or domain)."""
# 7-day baseline per dimension
baseline_rows = conn.execute(
f"""SELECT {dim_expr} as dim_val,
COUNT(CASE WHEN event='approved' THEN 1 END) as approved,
COUNT(*) as total
FROM audit_log
WHERE stage='evaluate'
AND event IN ('approved','changes_requested','domain_rejected','tier05_rejected')
AND timestamp > datetime('now', '-7 days')
AND {dim_expr} IS NOT NULL
GROUP BY dim_val HAVING total >= 5"""
).fetchall()
baselines = {r["dim_val"]: (r["approved"] / r["total"] * 100) for r in baseline_rows}
# 24h per dimension
recent_rows = conn.execute(
f"""SELECT {dim_expr} as dim_val,
COUNT(CASE WHEN event='approved' THEN 1 END) as approved,
COUNT(*) as total
FROM audit_log
WHERE stage='evaluate'
AND event IN ('approved','changes_requested','domain_rejected','tier05_rejected')
AND timestamp > datetime('now', '-24 hours')
AND {dim_expr} IS NOT NULL
GROUP BY dim_val HAVING total >= 5"""
).fetchall()
for r in recent_rows:
val = r["dim_val"]
if val not in baselines:
continue
recent_rate = r["approved"] / r["total"] * 100
base_rate = baselines[val]
drop = base_rate - recent_rate
if drop > APPROVAL_DROP_THRESHOLD:
alerts.append({
"id": f"quality_regression:{dim_name}:{val}",
"severity": "warning",
"category": "quality",
"title": f"{dim_name.title()} '{val}' approval dropped {drop:.0f}pp",
"detail": (
f"24h: {recent_rate:.1f}% vs 7d baseline: {base_rate:.1f}% "
f"({r['total']} evals in 24h)."
),
"agent": val if dim_name == "agent" else None,
"domain": val if dim_name == "domain" else None,
"detected_at": _now_iso(),
"auto_resolve": True,
})
# ─── Check: Throughput Anomaly ──────────────────────────────────────────────
def check_throughput(conn: sqlite3.Connection) -> list[dict]:
    """Detect throughput stalling — today's merges vs the recent daily average.

    Counts merged PRs per calendar day over the last 7 days. The average is
    taken over the earlier days that had at least one merge; today's count is
    looked up by date and is 0 when no PR has been merged today. A warning is
    emitted when today's count falls below THROUGHPUT_DROP_RATIO of that
    average.

    Args:
        conn: SQLite connection whose rows can be read by column name.

    Returns:
        A list holding at most one "throughput:stalling" alert.
    """
    alerts = []
    # Daily merged counts for last 7 days
    rows = conn.execute(
        """SELECT date(merged_at) as day, COUNT(*) as n
           FROM prs WHERE merged_at > datetime('now', '-7 days')
           GROUP BY day ORDER BY day"""
    ).fetchall()
    # A day without merges produces no row, so "today" must be identified by
    # date. Taking the last row would report the previous active day as today
    # and hide a complete stall.
    # NOTE(review): assumes merged_at is stored in UTC, matching SQLite's
    # datetime('now') used in the query — confirm against the writer.
    today = datetime.now(timezone.utc).date().isoformat()
    counts_by_day = {r["day"]: r["n"] for r in rows if r["day"] is not None}
    today_count = counts_by_day.pop(today, 0)
    prior_counts = list(counts_by_day.values())  # already in day order
    if not prior_counts:
        return alerts  # Not enough data to form a baseline
    sma = statistics.mean(prior_counts)
    daily_counts = prior_counts + [today_count]
    if sma > 0 and today_count < sma * THROUGHPUT_DROP_RATIO:
        alerts.append({
            "id": "throughput:stalling",
            "severity": "warning",
            "category": "throughput",
            "title": f"Throughput stalling: {today_count} merges today vs {sma:.0f}/day avg",
            "detail": (
                f"Today's merge count ({today_count}) is below {THROUGHPUT_DROP_RATIO:.0%} of "
                f"7-day average ({sma:.1f}/day). Daily counts: {daily_counts}."
            ),
            "agent": None,
            "domain": None,
            "detected_at": _now_iso(),
            "auto_resolve": True,
        })
    return alerts
# ─── Check: Rejection Reason Spike ─────────────────────────────────────────
def check_rejection_spike(conn: sqlite3.Connection) -> list[dict]:
    """Warn when one rejection tag dominates the last 24 hours of rejections.

    The denominator is the number of rejection events; the numerator is how
    often a tag appears in those events' ``$.issues`` arrays. A warning is
    produced for every tag whose share exceeds REJECTION_SPIKE_RATIO. Nothing
    is reported while fewer than 10 rejections exist in the window.

    Args:
        conn: SQLite connection whose rows can be read by column name.

    Returns:
        A list of alert dicts (possibly empty).
    """
    rejection_total = conn.execute(
        """SELECT COUNT(*) as n FROM audit_log
           WHERE stage='evaluate'
           AND event IN ('changes_requested','domain_rejected','tier05_rejected')
           AND timestamp > datetime('now', '-24 hours')"""
    ).fetchone()["n"]
    if rejection_total < 10:
        return []  # too little data for a meaningful share

    tag_rows = conn.execute(
        """SELECT value as tag, COUNT(*) as cnt
           FROM audit_log, json_each(json_extract(detail, '$.issues'))
           WHERE stage='evaluate'
           AND event IN ('changes_requested','domain_rejected','tier05_rejected')
           AND timestamp > datetime('now', '-24 hours')
           GROUP BY tag ORDER BY cnt DESC"""
    ).fetchall()
    shares = ((row["tag"], row["cnt"], row["cnt"] / rejection_total) for row in tag_rows)
    return [
        {
            "id": f"rejection_spike:{tag}",
            "severity": "warning",
            "category": "quality",
            "title": f"Rejection reason '{tag}' at {share:.0%} of rejections",
            "detail": (
                f"'{tag}' accounts for {hits}/{rejection_total} rejections in 24h "
                f"({share:.1%}). Threshold: {REJECTION_SPIKE_RATIO:.0%}."
            ),
            "agent": None,
            "domain": None,
            "detected_at": _now_iso(),
            "auto_resolve": True,
        }
        for tag, hits, share in shares
        if share > REJECTION_SPIKE_RATIO
    ]
# ─── Check: Stuck Loops ────────────────────────────────────────────────────
def check_stuck_loops(conn: sqlite3.Connection) -> list[dict]:
    """Flag agents that keep getting rejected for the same reason.

    Groups rejection events from the last 6 hours by (agent, issue tag) and
    emits a critical alert for each pair seen more than STUCK_LOOP_THRESHOLD
    times.

    Args:
        conn: SQLite connection whose rows can be read by column name.

    Returns:
        A list of alert dicts (possibly empty).
    """
    # COALESCE: rejection events use $.agent, eval events use $.domain_agent (Epimetheus 2026-03-28)
    repeated = conn.execute(
        """SELECT COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) as agent,
           value as tag,
           COUNT(*) as cnt
           FROM audit_log, json_each(json_extract(detail, '$.issues'))
           WHERE stage='evaluate'
           AND event IN ('changes_requested','domain_rejected','tier05_rejected')
           AND timestamp > datetime('now', '-6 hours')
           AND COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) IS NOT NULL
           GROUP BY agent, tag
           HAVING cnt > ?""",
        (STUCK_LOOP_THRESHOLD,),
    ).fetchall()
    return [
        {
            "id": f"stuck_loop:{hit['agent']}:{hit['tag']}",
            "severity": "critical",
            "category": "health",
            "title": f"Agent '{hit['agent']}' stuck: '{hit['tag']}' failed {hit['cnt']}x in 6h",
            "detail": (
                f"Agent '{hit['agent']}' has been rejected for '{hit['tag']}' "
                f"{hit['cnt']} times in the last 6 hours (threshold: {STUCK_LOOP_THRESHOLD}). "
                f"Stop and reassess."
            ),
            "agent": hit["agent"],
            "domain": None,
            "detected_at": _now_iso(),
            "auto_resolve": True,
        }
        for hit in repeated
    ]
# ─── Check: Cost Spikes ────────────────────────────────────────────────────
def check_cost_spikes(conn: sqlite3.Connection) -> list[dict]:
    """Warn when an agent's last-day cost exceeds COST_SPIKE_RATIO times its 7-day daily average.

    Per-agent costs come from the ``costs`` table when it exposes both
    ``agent`` and ``cost_usd`` columns; otherwise the per-PR ``cost_usd`` on
    ``prs`` is used instead.

    Args:
        conn: SQLite connection whose rows can be read by column name.

    Returns:
        A list of alert dicts (possibly empty). An sqlite3 error while
        inspecting the ``costs`` schema yields an empty list.
    """
    try:
        schema = conn.execute("PRAGMA table_info(costs)").fetchall()
        col_names = {column["name"] for column in schema}
    except sqlite3.Error:
        return []

    if "agent" in col_names and "cost_usd" in col_names:
        spend = conn.execute(
            """SELECT agent,
               SUM(CASE WHEN timestamp > datetime('now', '-1 day') THEN cost_usd ELSE 0 END) as today_cost,
               SUM(CASE WHEN timestamp > datetime('now', '-7 days') THEN cost_usd ELSE 0 END) / 7.0 as avg_daily
               FROM costs WHERE agent IS NOT NULL
               GROUP BY agent
               HAVING avg_daily > 0"""
        ).fetchall()
    else:
        # Fall back to per-PR cost tracking
        spend = conn.execute(
            """SELECT agent,
               SUM(CASE WHEN created_at > datetime('now', '-1 day') THEN cost_usd ELSE 0 END) as today_cost,
               SUM(CASE WHEN created_at > datetime('now', '-7 days') THEN cost_usd ELSE 0 END) / 7.0 as avg_daily
               FROM prs WHERE agent IS NOT NULL AND cost_usd > 0
               GROUP BY agent
               HAVING avg_daily > 0"""
        ).fetchall()

    found = []
    for entry in spend:
        if not entry["avg_daily"]:
            continue
        if not entry["today_cost"] > entry["avg_daily"] * COST_SPIKE_RATIO:
            continue
        multiple = entry["today_cost"] / entry["avg_daily"]
        found.append({
            "id": f"cost_spike:{entry['agent']}",
            "severity": "warning",
            "category": "health",
            "title": f"Agent '{entry['agent']}' cost spike: ${entry['today_cost']:.2f} today ({multiple:.1f}x avg)",
            "detail": (
                f"Today's cost (${entry['today_cost']:.2f}) is {multiple:.1f}x the 7-day daily average "
                f"(${entry['avg_daily']:.2f}). Threshold: {COST_SPIKE_RATIO}x."
            ),
            "agent": entry["agent"],
            "domain": None,
            "detected_at": _now_iso(),
            "auto_resolve": True,
        })
    return found
# ─── Check: Domain Rejection Patterns (Theseus addition) ───────────────────
def check_domain_rejection_patterns(conn: sqlite3.Connection) -> list[dict]:
    """Surface domains whose rejections concentrate on a single reason.

    Tallies rejection tags per domain over the last 24 hours. A domain with at
    least 5 tagged rejections where the most frequent tag accounts for more
    than half of them yields one informational alert.

    Args:
        conn: SQLite connection whose rows can be read by column name.

    Returns:
        A list of alert dicts (possibly empty).
    """
    tallies = conn.execute(
        """SELECT json_extract(detail, '$.domain') as domain,
           value as tag,
           COUNT(*) as cnt
           FROM audit_log, json_each(json_extract(detail, '$.issues'))
           WHERE stage='evaluate'
           AND event IN ('changes_requested','domain_rejected','tier05_rejected')
           AND timestamp > datetime('now', '-24 hours')
           AND json_extract(detail, '$.domain') IS NOT NULL
           GROUP BY domain, tag
           ORDER BY domain, cnt DESC"""
    ).fetchall()

    # Bucket per domain; the ORDER BY puts each domain's top tag first.
    per_domain = {}
    for tally in tallies:
        per_domain.setdefault(tally["domain"], []).append(
            {"tag": tally["tag"], "count": tally["cnt"]}
        )

    found = []
    for domain, tag_counts in per_domain.items():
        total = sum(item["count"] for item in tag_counts)
        if total < 5:
            continue
        leader = tag_counts[0]
        share = leader["count"] / total
        # Concentrated failure: one reason explains more than half of the rejections.
        if share <= 0.5:
            continue
        found.append({
            "id": f"domain_rejection_pattern:{domain}:{leader['tag']}",
            "severity": "info",
            "category": "failure_pattern",
            "title": f"Domain '{domain}': {share:.0%} of rejections are '{leader['tag']}'",
            "detail": (
                f"In domain '{domain}', {leader['count']}/{total} rejections (24h) are for "
                f"'{leader['tag']}'. This may indicate a systematic issue with evidence standards "
                f"or schema compliance in this domain."
            ),
            "agent": None,
            "domain": domain,
            "detected_at": _now_iso(),
            "auto_resolve": True,
        })
    return found
# ─── Failure Report Generator ───────────────────────────────────────────────
def generate_failure_report(conn: sqlite3.Connection, agent: str, hours: int = 24) -> dict | None:
"""Compile a failure report for a specific agent.
Returns top rejection reasons, example PRs, and suggested fixes.
Designed to be sent directly to the agent via Pentagon messaging.
"""
hours = int(hours) # defensive — callers should pass int, but enforce it
rows = conn.execute(
"""SELECT value as tag, COUNT(*) as cnt,
GROUP_CONCAT(DISTINCT json_extract(detail, '$.pr')) as pr_numbers
FROM audit_log, json_each(json_extract(detail, '$.issues'))
WHERE stage='evaluate'
AND event IN ('changes_requested','domain_rejected','tier05_rejected')
AND json_extract(detail, '$.agent') = ?
AND timestamp > datetime('now', ? || ' hours')
GROUP BY tag ORDER BY cnt DESC
LIMIT 5""",
(agent, f"-{hours}"),
).fetchall()
if not rows:
return None
total_rejections = sum(r["cnt"] for r in rows)
top_reasons = []
for r in rows:
prs = r["pr_numbers"].split(",")[:3] if r["pr_numbers"] else []
top_reasons.append({
"reason": r["tag"],
"count": r["cnt"],
"pct": round(r["cnt"] / total_rejections * 100, 1),
"example_prs": prs,
"suggestion": _suggest_fix(r["tag"]),
})
return {
"agent": agent,
"period_hours": hours,
"total_rejections": total_rejections,
"top_reasons": top_reasons,
"generated_at": _now_iso(),
}
def _suggest_fix(rejection_tag: str) -> str:
"""Map known rejection reasons to actionable suggestions."""
suggestions = {
"broken_wiki_links": "Check that all [[wiki links]] in claims resolve to existing files. Run link validation before submitting.",
"near_duplicate": "Search existing claims before creating new ones. Use semantic search to find similar claims.",
"frontmatter_schema": "Validate YAML frontmatter against the claim schema. Required fields: title, domain, confidence, type.",
"weak_evidence": "Add concrete sources, data points, or citations. Claims need evidence that can be independently verified.",
"missing_confidence": "Every claim needs a confidence level: proven, likely, experimental, or speculative.",
"domain_mismatch": "Ensure claims are filed under the correct domain. Check domain definitions if unsure.",
"too_broad": "Break broad claims into specific, testable sub-claims.",
"missing_links": "Claims should link to related claims, entities, or sources. Isolated claims are harder to verify.",
}
return suggestions.get(rejection_tag, f"Review rejection reason '{rejection_tag}' and adjust extraction accordingly.")
# ─── Run All Checks ────────────────────────────────────────────────────────
def run_all_checks(conn: sqlite3.Connection) -> list[dict]:
    """Run every monitoring check in a fixed order and return their alerts as one list."""
    checks = (
        check_agent_health,
        check_quality_regression,
        check_throughput,
        check_rejection_spike,
        check_stuck_loops,
        check_cost_spikes,
        check_domain_rejection_patterns,
    )
    return [alert for check in checks for alert in check(conn)]
def format_alert_message(alert: dict) -> str:
    """Render an alert as a two-line message for Pentagon messaging.

    The first line is a severity marker plus the title, the second line is the
    detail text. Unknown severities get the "?" marker.
    """
    markers = {"critical": "!!", "warning": "!", "info": "~"}
    marker = markers.get(alert["severity"], "?")
    headline = f"[{marker}] {alert['title']}"
    return f"{headline}\n{alert['detail']}"

View file

@ -1,125 +0,0 @@
"""Route handlers for /check and /api/alerts endpoints.
Import into app.py and register routes in create_app().
"""
import json
import logging
from datetime import datetime, timezone
from aiohttp import web
from alerting import run_all_checks, generate_failure_report, format_alert_message # requires CWD = deploy dir; switch to relative import if packaged
# Module logger; the name is fixed rather than derived from __name__.
logger = logging.getLogger("argus.alerting")
# In-memory alert store (replaced each /check cycle, persists between requests)
# Both names are rebound by handle_check and read by handle_api_alerts. They
# live in this module's globals, so they start empty on every import.
_active_alerts: list[dict] = []
# ISO timestamp of the last completed /check run; None until the first one.
_last_check: str | None = None
async def handle_check(request):
    """GET /check — run all monitoring checks, update active alerts, return results.

    Designed to be called by systemd timer every 5 minutes.
    Returns a JSON summary of all detected issues, plus a failure report for
    every agent that has a stuck-loop alert.

    The connection obtained from the app's connection factory is always closed
    before returning. If a check or a failure report raises, the error is
    logged with its traceback, the previously stored alerts are left in place,
    and a 500 JSON response is returned.
    """
    global _active_alerts, _last_check
    conn = request.app["_alerting_conn_func"]()
    try:
        alerts = run_all_checks(conn)
        # Generate failure reports for agents with stuck loops. Match on the
        # id prefix so an unrelated health alert whose id merely contains
        # "stuck" (e.g. via an agent name) is not picked up.
        failure_reports = {}
        stuck_agents = {
            a["agent"]
            for a in alerts
            if a["category"] == "health" and a["id"].startswith("stuck_loop:") and a["agent"]
        }
        for agent in stuck_agents:
            report = generate_failure_report(conn, agent)
            if report:
                failure_reports[agent] = report
    except Exception as e:
        logger.exception("Check failed: %s", e)
        return web.json_response({"error": str(e)}, status=500)
    finally:
        conn.close()
    _active_alerts = alerts
    _last_check = datetime.now(timezone.utc).isoformat()
    result = {
        "checked_at": _last_check,
        "alert_count": len(alerts),
        "critical": sum(1 for a in alerts if a["severity"] == "critical"),
        "warning": sum(1 for a in alerts if a["severity"] == "warning"),
        "info": sum(1 for a in alerts if a["severity"] == "info"),
        "alerts": alerts,
        "failure_reports": failure_reports,
    }
    logger.info(
        "Check complete: %d alerts (%d critical, %d warning)",
        len(alerts),
        result["critical"],
        result["warning"],
    )
    return web.json_response(result)
async def handle_api_alerts(request):
    """GET /api/alerts — return current active alerts.

    Query params (each optional, combined with AND):
        severity: filter by severity (critical, warning, info)
        category: filter by category (health, quality, throughput, failure_pattern)
        agent: filter by agent name
        domain: filter by domain
    """
    params = request.query
    selected = list(_active_alerts)
    # severity and category are read with direct indexing.
    for field in ("severity", "category"):
        wanted = params.get(field)
        if wanted:
            selected = [entry for entry in selected if entry[field] == wanted]
    # agent and domain are read with .get().
    for field in ("agent", "domain"):
        wanted = params.get(field)
        if wanted:
            selected = [entry for entry in selected if entry.get(field) == wanted]
    payload = {
        "alerts": selected,
        "total": len(selected),
        "last_check": _last_check,
    }
    return web.json_response(payload)
# Upper bound for the lookback window accepted from the query string (7 days).
_MAX_REPORT_HOURS = 168


async def handle_api_failure_report(request):
    """GET /api/failure-report/{agent} — generate failure report for an agent.

    Query params:
        hours: lookback window in hours (default 24). A non-numeric value
            falls back to the default; the value is clamped to the range
            1.._MAX_REPORT_HOURS.

    Returns the report as JSON, or a "no_rejections" status object when the
    agent has no rejections in the window. The connection obtained from the
    app's connection factory is always closed before returning.
    """
    agent = request.match_info["agent"]
    try:
        hours = int(request.query.get("hours", "24"))
    except ValueError:
        # Bad input from the query string should not surface as a 500.
        hours = 24
    hours = max(1, min(hours, _MAX_REPORT_HOURS))
    conn = request.app["_alerting_conn_func"]()
    try:
        report = generate_failure_report(conn, agent, hours)
    finally:
        conn.close()
    if not report:
        return web.json_response({"agent": agent, "status": "no_rejections", "period_hours": hours})
    return web.json_response(report)
def register_alerting_routes(app, get_conn_func):
    """Attach the alerting endpoints to an application.

    Stores the connection factory on the app under "_alerting_conn_func" (the
    key the handlers read it from) and registers the three GET routes.

    get_conn_func: callable that returns a read-only sqlite3.Connection
    """
    app["_alerting_conn_func"] = get_conn_func
    routes = (
        ("/check", handle_check),
        ("/api/alerts", handle_api_alerts),
        ("/api/failure-report/{agent}", handle_api_failure_report),
    )
    for path, handler in routes:
        app.router.add_get(path, handler)

View file

@ -93,7 +93,115 @@ echo "Deploy complete."
if $RESTART; then
echo ""
echo "=== Restarting services ==="
ssh "$VPS_HOST" "sudo systemctl restart teleo-pipeline teleo-diagnostics"
echo "Services restarted."
echo "=== Detecting services to restart ==="
# Determine which services need restart based on what was deployed.
# rsync touched these paths → these services:
# pipeline-v2/lib/, pipeline-v2/*.py → teleo-pipeline
# diagnostics/ → teleo-diagnostics
# agent-state/, research-session.sh → no restart (not daemons)
RESTART_SVCS=""
# Check VPS for recent file changes from this deploy
# Compare local files against VPS to see what actually changed
PIPELINE_CHANGED=false
DIAG_CHANGED=false
# Pipeline: lib/ or top-level scripts
if ! rsync -avzn --exclude='__pycache__' --exclude='*.pyc' --exclude='*.bak*' \
"$REPO_ROOT/ops/pipeline-v2/lib/" "$VPS_HOST:$VPS_PIPELINE/lib/" 2>/dev/null | grep -q '\.py$'; then
true # no python changes
else
PIPELINE_CHANGED=true
fi
for f in teleo-pipeline.py reweave.py; do
if [ -f "$REPO_ROOT/ops/pipeline-v2/$f" ]; then
if rsync -avzn "$REPO_ROOT/ops/pipeline-v2/$f" "$VPS_HOST:$VPS_PIPELINE/$f" 2>/dev/null | grep -q "$f"; then
PIPELINE_CHANGED=true
fi
fi
done
# Diagnostics
if rsync -avzn --exclude='__pycache__' --exclude='*.pyc' --exclude='*.bak*' \
"$REPO_ROOT/ops/diagnostics/" "$VPS_HOST:$VPS_DIAGNOSTICS/" 2>/dev/null | grep -q '\.py$'; then
DIAG_CHANGED=true
fi
if $PIPELINE_CHANGED; then
RESTART_SVCS="$RESTART_SVCS teleo-pipeline"
echo " teleo-pipeline: files changed, will restart"
else
echo " teleo-pipeline: no changes, skipping"
fi
if $DIAG_CHANGED; then
RESTART_SVCS="$RESTART_SVCS teleo-diagnostics"
echo " teleo-diagnostics: files changed, will restart"
else
echo " teleo-diagnostics: no changes, skipping"
fi
if [ -z "$RESTART_SVCS" ]; then
echo ""
echo "No service files changed. Skipping restart."
else
echo ""
echo "=== Restarting:$RESTART_SVCS ==="
ssh "$VPS_HOST" "sudo systemctl restart $RESTART_SVCS"
echo "Services restarted. Waiting 5s for startup..."
sleep 5
echo ""
echo "=== Smoke test ==="
SMOKE_FAIL=0
# Check systemd unit status for restarted services
for svc in $RESTART_SVCS; do
if ssh "$VPS_HOST" "systemctl is-active --quiet $svc"; then
echo " $svc: active"
else
echo " $svc: FAILED"
ssh "$VPS_HOST" "journalctl -u $svc -n 10 --no-pager" || true
SMOKE_FAIL=1
fi
done
# Hit health endpoints for restarted services
if echo "$RESTART_SVCS" | grep -q "teleo-pipeline"; then
if ssh "$VPS_HOST" "curl -sf --connect-timeout 3 http://localhost:8080/health > /dev/null"; then
echo " pipeline health (8080): OK"
else
echo " pipeline health (8080): FAILED"
SMOKE_FAIL=1
fi
fi
if echo "$RESTART_SVCS" | grep -q "teleo-diagnostics"; then
if ssh "$VPS_HOST" "curl -sf --connect-timeout 3 http://localhost:8081/ops > /dev/null"; then
echo " diagnostics (8081): OK"
else
echo " diagnostics (8081): FAILED"
SMOKE_FAIL=1
fi
fi
# Tail logs for quick visual check
echo ""
echo "=== Recent logs (10s) ==="
JOURNAL_UNITS=""
for svc in $RESTART_SVCS; do
JOURNAL_UNITS="$JOURNAL_UNITS -u $svc"
done
ssh "$VPS_HOST" "journalctl $JOURNAL_UNITS --since '-10s' --no-pager -n 20" || true
if [ "$SMOKE_FAIL" -gt 0 ]; then
echo ""
echo "WARNING: Smoke test detected failures. Check logs above."
exit 1
fi
echo ""
echo "Smoke test passed."
fi
fi

View file

@ -0,0 +1,141 @@
# Diagnostics Consolidation Diff Log
# Branch: epimetheus/consolidate-infra
# Date: 2026-04-13
## Files with multiple copies — resolution
### alerting.py
- ROOT diagnostics/alerting.py (22320 bytes) — KEPT (newer: has _ALLOWED_DIM_EXPRS SQL injection protection, stricter dim_expr validation)
- ops/diagnostics/alerting.py (22039 bytes) — OVERWRITTEN (missing SQL injection guards)
- VPS /opt/teleo-eval/diagnostics/alerting.py (22039 bytes) — matches ops/ version, needs deploy
### alerting_routes.py
- ROOT diagnostics/alerting_routes.py (4216 bytes) — KEPT (newer: proper try/finally/conn.close, ValueError catch on hours param)
- ops/diagnostics/alerting_routes.py (4043 bytes) — OVERWRITTEN (missing error handling, missing conn.close)
- VPS /opt/teleo-eval/diagnostics/alerting_routes.py (4043 bytes) — matches ops/ version, needs deploy
### vitality.py
- ROOT diagnostics/vitality.py (25548 bytes) — KEPT (only copy in repo, larger than VPS)
- VPS /opt/teleo-eval/diagnostics/vitality.py (18539 bytes) — older version, needs deploy
- MOVED TO: ops/diagnostics/vitality.py
### vitality_routes.py
- ROOT diagnostics/vitality_routes.py (10824 bytes) — KEPT (only copy in repo, larger than VPS)
- VPS /opt/teleo-eval/diagnostics/vitality_routes.py (9729 bytes) — older version, needs deploy
- MOVED TO: ops/diagnostics/vitality_routes.py
## Files moved
| From | To | Reason |
|------|-----|--------|
| diagnostics/vitality.py | ops/diagnostics/vitality.py | Consolidate to canonical location |
| diagnostics/vitality_routes.py | ops/diagnostics/vitality_routes.py | Consolidate to canonical location |
| diagnostics/alerting.py | ops/diagnostics/alerting.py | Newer version overwrites older |
| diagnostics/alerting_routes.py | ops/diagnostics/alerting_routes.py | Newer version overwrites older |
## Root diagnostics/ after consolidation
- PATCH_INSTRUCTIONS.md — kept (documentation, not code)
- evolution.md — kept (documentation)
- weekly/2026-03-25-week3.md — kept (report)
- ops/sessions/*.json — kept (session data)
- alerting.py, alerting_routes.py REMOVED by this consolidation
- vitality.py, vitality_routes.py were already absent (moved in prior commit)
- No .py files remain in root diagnostics/
## VPS .bak files inventory (30+ files)
All of them are in /opt/teleo-eval/diagnostics/. Git is now the backup, so they are safe to delete once the consolidation has been verified.
## VPS deploy needed after merge
alerting.py, alerting_routes.py, vitality.py, vitality_routes.py — the local version of each file is newer than the copy on the VPS.
---
## Root Patch Script Audit (Epimetheus's 7 patches)
### patch-prompt-version.py — APPLIED
- **Target:** db.py, merge.py, extract.py, extraction_prompt.py
- **What:** Schema v17 migration for prompt_version/pipeline_version columns, version stamping on PR discovery, feedback param for re-extraction
- **Status:** All 4 targets have changes. Schema is at v19 (includes this migration). merge.py stamps versions. extract.py has feedback param. extraction_prompt.py has previous_feedback.
- **Action:** SAFE TO DELETE
### tmp-patch-research-state.py — APPLIED
- **Target:** research-session.sh
- **What:** Integrates agent-state hooks (state_start_session, state_update_report, state_journal_append)
- **Status:** All hooks present in research-session.sh (STATE_LIB sourcing, HAS_STATE init, session lifecycle calls)
- **Action:** SAFE TO DELETE
### patch-dashboard-cost.py — STALE (superseded)
- **Target:** dashboard_routes.py
- **What:** Adds per-PR cost queries via audit_log (cost_map, triage_cost_map)
- **Status:** Cost tracking implemented differently in current codebase — uses `costs` table and p.cost_usd column, not audit_log aggregation. Patch logic abandoned in favor of newer approach.
- **Action:** SAFE TO DELETE (superseded by different implementation)
### patch-dashboard-prs-cost.py — STALE (superseded)
- **Target:** dashboard_prs.py
- **What:** Adds Cost column header, fmtCost() function, cost cell in row template
- **Status:** Cost KPI card exists (line 101) but implemented as card-based KPI, not table column. fmtCost() not present. Different UI approach than patch intended.
- **Action:** SAFE TO DELETE (superseded by card-based cost display)
### patch-cost-per-pr.py — NOT APPLIED
- **Target:** evaluate.py
- **What:** Adds _estimate_cost() helper function, cost instrumentation to audit events (haiku_triage, domain_rejected, approved, changes_requested)
- **Status:** _estimate_cost not found in evaluate.py. No cost fields in audit events. eval_checks.py has its own estimate_cost but for bot responses, not pipeline eval.
- **Action:** SAFE TO DELETE — eval_checks.py already has cost estimation for its own use case. The pipeline eval cost tracking was a different approach that was never completed.
### patch-dashboard-prs-version.py — NOT APPLIED
- **Target:** dashboard_prs.py
- **What:** Adds version badges (prompt_version, pipeline_version) to eval chain section and agent cell
- **Status:** No version badges in dashboard_prs.py. prompt_version/pipeline_version not displayed anywhere.
- **Action:** SAFE TO DELETE — version columns exist in schema (v17 migration) but UI display was never built. Low priority feature, can be re-implemented from schema when needed.
### patch-dashboard-version.py — NOT APPLIED
- **Target:** dashboard_routes.py, shared_ui.py
- **What:** Adds prompt_version/pipeline_version to SELECT query, version badges to shared_ui
- **Status:** Version fields not in SELECT. shared_ui.py exists but without version display.
- **Action:** SAFE TO DELETE — same reasoning as patch-dashboard-prs-version.py.
### Summary
| Script | Status | Action |
|--------|--------|--------|
| patch-prompt-version.py | APPLIED | Delete |
| tmp-patch-research-state.py | APPLIED | Delete |
| patch-dashboard-cost.py | STALE (superseded) | Delete |
| patch-dashboard-prs-cost.py | STALE (superseded) | Delete |
| patch-cost-per-pr.py | NOT APPLIED (abandoned) | Delete |
| patch-dashboard-prs-version.py | NOT APPLIED (low priority) | Delete |
| patch-dashboard-version.py | NOT APPLIED (low priority) | Delete |
All 7 safe to delete. 2 were applied, 2 were superseded by different implementations, 3 were never applied but the features either exist differently or are low priority.
---
## Root Orphan Files
### extract.py (693 lines)
- **Location:** Pentagon workspace root
- **Canonical:** teleo-codex/ops/pipeline-v2/openrouter-extract-v2.py (Apr 7+)
- **Status:** Older draft (Apr 1). Confirmed by Cory as safe to delete.
- **Action:** DELETE
### cascade.py (274 lines)
- **Location:** Pentagon workspace root
- **Canonical:** teleo-codex/ops/pipeline-v2/lib/cascade.py (10372 bytes, Apr 13)
- **Status:** Older draft. Confirmed by Cory as safe to delete.
- **Action:** DELETE
---
## Argus's Patch Scripts (in root diagnostics/)
8 patch scripts owned by Argus — audit responsibility is Argus's:
- diagnostics/compute_profile_patch.py
- diagnostics/dashboard_compute_patch.py
- diagnostics/patch_4page.py
- diagnostics/patch_dashboard_tokens.py
- diagnostics/patch_evaluate_costs.py
- diagnostics/patch_llm_cli.py
- diagnostics/patch_prs_page.py
- diagnostics/patch_vps_app.py
These remain in root diagnostics/ until Argus completes his audit.

View file

@ -157,8 +157,17 @@ def check_quality_regression(conn: sqlite3.Connection) -> list[dict]:
return alerts
_ALLOWED_DIM_EXPRS = frozenset({
"json_extract(detail, '$.agent')",
"json_extract(detail, '$.domain')",
"COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent'))",
})
def _check_approval_by_dimension(conn, alerts, dim_name, dim_expr):
"""Check approval rate regression grouped by a dimension (agent or domain)."""
"""Check approval rate regression grouped by a dimension. dim_expr must be in _ALLOWED_DIM_EXPRS."""
if dim_expr not in _ALLOWED_DIM_EXPRS:
raise ValueError(f"untrusted dim_expr: {dim_expr}")
# 7-day baseline per dimension
baseline_rows = conn.execute(
f"""SELECT {dim_expr} as dim_val,
@ -468,7 +477,7 @@ def generate_failure_report(conn: sqlite3.Connection, agent: str, hours: int = 2
FROM audit_log, json_each(json_extract(detail, '$.issues'))
WHERE stage='evaluate'
AND event IN ('changes_requested','domain_rejected','tier05_rejected')
AND COALESCE(json_extract(detail, '$.agent'), json_extract(detail, '$.domain_agent')) = ?
AND json_extract(detail, '$.agent') = ?
AND timestamp > datetime('now', ? || ' hours')
GROUP BY tag ORDER BY cnt DESC
LIMIT 5""",

View file

@ -26,22 +26,24 @@ async def handle_check(request):
conn = request.app["_alerting_conn_func"]()
try:
alerts = run_all_checks(conn)
# Generate failure reports for agents with stuck loops
failure_reports = {}
stuck_agents = {a["agent"] for a in alerts if a["category"] == "health" and "stuck" in a["id"] and a["agent"]}
for agent in stuck_agents:
report = generate_failure_report(conn, agent)
if report:
failure_reports[agent] = report
except Exception as e:
logger.error("Check failed: %s", e)
return web.json_response({"error": str(e)}, status=500)
finally:
conn.close()
global _active_alerts, _last_check
_active_alerts = alerts
_last_check = datetime.now(timezone.utc).isoformat()
# Generate failure reports for agents with stuck loops
failure_reports = {}
stuck_agents = {a["agent"] for a in alerts if a["category"] == "health" and "stuck" in a["id"] and a["agent"]}
for agent in stuck_agents:
report = generate_failure_report(conn, agent)
if report:
failure_reports[agent] = report
result = {
"checked_at": _last_check,
"alert_count": len(alerts),
@ -104,10 +106,15 @@ async def handle_api_failure_report(request):
hours: lookback window (default 24)
"""
agent = request.match_info["agent"]
hours = int(request.query.get("hours", "24"))
try:
hours = min(int(request.query.get("hours", "24")), 168)
except ValueError:
hours = 24
conn = request.app["_alerting_conn_func"]()
report = generate_failure_report(conn, agent, hours)
try:
report = generate_failure_report(conn, agent, hours)
finally:
conn.close()
if not report:
return web.json_response({"agent": agent, "status": "no_rejections", "period_hours": hours})

View file

@ -74,7 +74,7 @@ def render_epistemic_page(vital_signs: dict, now: datetime) -> str:
<div style="font-size:40px;margin-bottom:12px;opacity:0.3">&#9881;</div>
<div style="color:#8b949e">
Multi-model agreement rate requires the <code>model_evals</code> table.<br>
<span style="font-size:12px">Blocked on: model_evals table creation (Theseus 2 Phase 3)</span>
<span style="font-size:12px">Blocked on: model_evals table creation (Ship Phase 3)</span>
</div>
<div style="margin-top:16px;font-size:12px;color:#8b949e">
Current eval models: Haiku (triage), GPT-4o (domain), Sonnet/Opus (Leo).<br>

View file

@ -1,8 +1,8 @@
"""PR Lifecycle dashboard — single-page view of every PR through the pipeline.
Sortable table: PR#, summary, claims, domain, contributor, outcome, evals, evaluator, cost, date.
Click any row to expand: claim titles, eval chain, timeline, reviews, issues.
Hero cards: total PRs, merge rate, total claims, est. cost.
Sortable table: PR#, summary, claims, domain, outcome, evals, evaluator, cost, date.
Click any row to expand: timeline, claim list, issues summary.
Hero cards: total PRs, merge rate, median eval rounds, total claims, total cost.
Data sources: prs table, audit_log (eval rounds), review_records.
Owner: Ship
@ -14,7 +14,7 @@ from shared_ui import render_page
EXTRA_CSS = """
.content-wrapper { max-width: 1600px !important; }
.page-content { max-width: 1600px !important; }
.filters { display: flex; gap: 12px; flex-wrap: wrap; margin-bottom: 16px; }
.filters select, .filters input {
background: #161b22; color: #c9d1d9; border: 1px solid #30363d;
@ -22,15 +22,14 @@ EXTRA_CSS = """
.filters select:focus, .filters input:focus { border-color: #58a6ff; outline: none; }
.pr-table { width: 100%; border-collapse: collapse; font-size: 13px; table-layout: fixed; }
.pr-table th:nth-child(1) { width: 50px; } /* PR# */
.pr-table th:nth-child(2) { width: 28%; } /* Summary */
.pr-table th:nth-child(2) { width: 30%; } /* Summary */
.pr-table th:nth-child(3) { width: 50px; } /* Claims */
.pr-table th:nth-child(4) { width: 11%; } /* Domain */
.pr-table th:nth-child(5) { width: 10%; } /* Contributor */
.pr-table th:nth-child(6) { width: 10%; } /* Outcome */
.pr-table th:nth-child(7) { width: 44px; } /* Evals */
.pr-table th:nth-child(8) { width: 12%; } /* Evaluator */
.pr-table th:nth-child(9) { width: 60px; } /* Cost */
.pr-table th:nth-child(10) { width: 80px; } /* Date */
.pr-table th:nth-child(4) { width: 12%; } /* Domain */
.pr-table th:nth-child(5) { width: 10%; } /* Outcome */
.pr-table th:nth-child(6) { width: 50px; } /* Evals */
.pr-table th:nth-child(7) { width: 16%; } /* Evaluator */
.pr-table th:nth-child(8) { width: 70px; } /* Cost */
.pr-table th:nth-child(9) { width: 90px; } /* Date */
.pr-table td { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; padding: 8px 6px; }
.pr-table td:nth-child(2) { white-space: normal; overflow: visible; line-height: 1.4; }
.pr-table th { cursor: pointer; user-select: none; position: relative; padding: 8px 18px 8px 6px; }
@ -49,24 +48,22 @@ EXTRA_CSS = """
.pr-table .pr-link:hover { text-decoration: underline; }
.pr-table td .summary-text { font-size: 12px; color: #c9d1d9; }
.pr-table td .review-snippet { font-size: 11px; color: #f85149; margin-top: 2px; opacity: 0.8; }
.pr-table td .model-tag { font-size: 10px; color: #6e7681; background: #161b22; border-radius: 3px; padding: 1px 4px; }
.pr-table td .contributor-tag { font-size: 11px; color: #d2a8ff; }
.pr-table td .contributor-self { font-size: 11px; color: #6e7681; font-style: italic; }
.pr-table td .model-tag { font-size: 9px; color: #6e7681; background: #21262d; border-radius: 3px; padding: 1px 4px; display: inline-block; margin: 1px 0; }
.pr-table td .expand-chevron { display: inline-block; width: 12px; color: #484f58; font-size: 10px; transition: transform 0.2s; }
.pr-table tr.expanded .expand-chevron { transform: rotate(90deg); color: #58a6ff; }
.pr-table td .cost-val { font-size: 12px; color: #8b949e; }
.pr-table td .claims-count { font-size: 13px; color: #c9d1d9; text-align: center; }
.pr-table td .evals-count { font-size: 13px; text-align: center; }
.trace-panel { background: #0d1117; border: 1px solid #30363d; border-radius: 8px;
padding: 16px; margin: 4px 0 8px 0; font-size: 12px; display: none; }
.trace-panel.open { display: block; }
.trace-panel h4 { color: #58a6ff; font-size: 12px; margin: 12px 0 6px 0; }
.trace-panel h4:first-child { margin-top: 0; }
.claim-list { list-style: none; padding: 0; margin: 0; }
.claim-list li { padding: 4px 0 4px 16px; border-left: 2px solid #238636; color: #c9d1d9; font-size: 12px; line-height: 1.5; }
.claim-list li .claim-confidence { font-size: 10px; color: #8b949e; margin-left: 6px; }
.issues-box { background: #1c1210; border: 1px solid #f8514933; border-radius: 6px;
.trace-panel .section-title { color: #58a6ff; font-size: 12px; font-weight: 600; margin: 12px 0 6px; }
.trace-panel .section-title:first-child { margin-top: 0; }
.trace-panel .claim-list { list-style: none; padding: 0; margin: 0; }
.trace-panel .claim-list li { padding: 4px 0; border-bottom: 1px solid #21262d; color: #c9d1d9; font-size: 12px; }
.trace-panel .claim-list li:last-child { border-bottom: none; }
.trace-panel .issues-box { background: #1c1017; border: 1px solid #f8514930; border-radius: 6px;
padding: 8px 12px; margin: 4px 0; font-size: 12px; color: #f85149; }
.eval-chain { background: #161b22; border-radius: 6px; padding: 8px 12px; margin: 4px 0; font-size: 12px; }
.eval-chain .chain-step { display: inline-block; margin-right: 6px; }
.eval-chain .chain-arrow { color: #484f58; margin: 0 4px; }
.trace-timeline { list-style: none; padding: 0; }
.trace-timeline li { padding: 4px 0; border-left: 2px solid #30363d; padding-left: 12px; margin-left: 8px; }
.trace-timeline li .ts { color: #484f58; font-size: 11px; }
@ -76,6 +73,12 @@ EXTRA_CSS = """
.trace-timeline li.ev-changes .ev { color: #d29922; }
.review-text { background: #161b22; padding: 8px 12px; border-radius: 4px;
margin: 4px 0; white-space: pre-wrap; font-size: 11px; color: #8b949e; max-height: 200px; overflow-y: auto; }
.eval-chain { background: #161b22; border-radius: 6px; padding: 8px 12px; margin: 4px 0 8px;
font-size: 12px; display: flex; gap: 12px; flex-wrap: wrap; align-items: center; }
.eval-chain .step { display: flex; align-items: center; gap: 4px; }
.eval-chain .step-label { color: #8b949e; font-size: 11px; }
.eval-chain .step-model { color: #c9d1d9; font-size: 11px; font-weight: 600; }
.eval-chain .arrow { color: #484f58; }
.pagination { display: flex; gap: 8px; align-items: center; justify-content: center; margin-top: 16px; }
.pagination button { background: #161b22; color: #c9d1d9; border: 1px solid #30363d;
border-radius: 4px; padding: 4px 12px; cursor: pointer; font-size: 12px; }
@ -93,6 +96,7 @@ def render_prs_page(now: datetime) -> str:
<div class="grid" id="hero-cards">
<div class="card"><div class="label">Total PRs</div><div class="value blue" id="kpi-total">--</div><div class="detail" id="kpi-total-detail"></div></div>
<div class="card"><div class="label">Merge Rate</div><div class="value green" id="kpi-merge-rate">--</div><div class="detail" id="kpi-merge-detail"></div></div>
<div class="card"><div class="label">Median Eval Rounds</div><div class="value" id="kpi-rounds">--</div><div class="detail" id="kpi-rounds-detail"></div></div>
<div class="card"><div class="label">Total Claims</div><div class="value blue" id="kpi-claims">--</div><div class="detail" id="kpi-claims-detail"></div></div>
<div class="card"><div class="label">Est. Cost</div><div class="value" id="kpi-cost">--</div><div class="detail" id="kpi-cost-detail"></div></div>
</div>
@ -100,7 +104,6 @@ def render_prs_page(now: datetime) -> str:
<!-- Filters -->
<div class="filters">
<select id="filter-domain"><option value="">All Domains</option></select>
<select id="filter-contributor"><option value="">All Contributors</option></select>
<select id="filter-outcome">
<option value="">All Outcomes</option>
<option value="merged">Merged</option>
@ -130,10 +133,9 @@ def render_prs_page(now: datetime) -> str:
<th data-col="summary">Summary <span class="sort-arrow">&#9650;</span></th>
<th data-col="claims_count">Claims <span class="sort-arrow">&#9650;</span></th>
<th data-col="domain">Domain <span class="sort-arrow">&#9650;</span></th>
<th data-col="submitted_by">Contributor <span class="sort-arrow">&#9650;</span></th>
<th data-col="status">Outcome <span class="sort-arrow">&#9650;</span></th>
<th data-col="eval_rounds">Evals <span class="sort-arrow">&#9650;</span></th>
<th data-col="evaluator_label">Evaluator <span class="sort-arrow">&#9650;</span></th>
<th data-col="evaluator">Evaluator <span class="sort-arrow">&#9650;</span></th>
<th data-col="est_cost">Cost <span class="sort-arrow">&#9650;</span></th>
<th data-col="created_at">Date <span class="sort-arrow">&#9650;</span></th>
</tr>
@ -150,71 +152,42 @@ def render_prs_page(now: datetime) -> str:
</div>
"""
# Use single-quoted JS strings throughout to avoid Python/HTML escaping issues
scripts = """<script>
var PAGE_SIZE = 50;
var FORGEJO = 'https://git.livingip.xyz/teleo/teleo-codex/pulls/';
var allData = [];
var filtered = [];
var sortCol = 'number';
var sortAsc = false;
var page = 0;
var expandedPr = null;
// Tier-based cost estimates (per eval round)
var TIER_COSTS = {
'DEEP': 0.145, // Haiku triage + Gemini Flash domain + Opus Leo
'STANDARD': 0.043, // Haiku triage + Gemini Flash domain + Sonnet Leo
'LIGHT': 0.027 // Haiku triage + Gemini Flash domain only
};
function estimateCost(pr) {
var tier = pr.tier || 'STANDARD';
var rounds = pr.eval_rounds || 1;
var baseCost = TIER_COSTS[tier] || TIER_COSTS['STANDARD'];
return baseCost * rounds;
}
function fmtCost(val) {
if (val == null || val === 0) return '--';
return '$' + val.toFixed(3);
}
const PAGE_SIZE = 50;
const FORGEJO = 'https://git.livingip.xyz/teleo/teleo-codex/pulls/';
let allData = [];
let filtered = [];
let sortCol = 'number';
let sortAsc = false;
let page = 0;
let expandedPr = null;
function loadData() {
var days = document.getElementById('filter-days').value;
var url = '/api/pr-lifecycle' + (days !== '0' ? '?days=' + days : '?days=9999');
fetch(url).then(function(r) { return r.json(); }).then(function(data) {
allData = data.prs || [];
// Compute derived fields
allData.forEach(function(p) {
p.est_cost = estimateCost(p);
// Evaluator label for sorting
p.evaluator_label = p.domain_agent || p.agent || '--';
});
populateFilters(allData);
updateKPIs(data);
applyFilters();
}).catch(function() {
document.getElementById('pr-tbody').innerHTML =
'<tr><td colspan="10" style="text-align:center;color:#f85149;">Failed to load data</td></tr>';
'<tr><td colspan="9" style="text-align:center;color:#f85149;">Failed to load data</td></tr>';
});
}
function populateFilters(prs) {
var domains = [], contribs = [], seenD = {}, seenC = {};
var domains = [], seenD = {};
prs.forEach(function(p) {
if (p.domain && !seenD[p.domain]) { seenD[p.domain] = 1; domains.push(p.domain); }
var c = p.submitted_by || 'unknown';
if (!seenC[c]) { seenC[c] = 1; contribs.push(c); }
});
domains.sort(); contribs.sort();
domains.sort();
var domSel = document.getElementById('filter-domain');
var conSel = document.getElementById('filter-contributor');
var curDom = domSel.value, curCon = conSel.value;
var curDom = domSel.value;
domSel.innerHTML = '<option value="">All Domains</option>' +
domains.map(function(d) { return '<option value="' + esc(d) + '">' + esc(d) + '</option>'; }).join('');
conSel.innerHTML = '<option value="">All Contributors</option>' +
contribs.map(function(c) { return '<option value="' + esc(c) + '">' + esc(c) + '</option>'; }).join('');
domSel.value = curDom; conSel.value = curCon;
domSel.value = curDom;
}
function updateKPIs(data) {
@ -226,29 +199,47 @@ def render_prs_page(now: datetime) -> str:
document.getElementById('kpi-merge-rate').textContent = fmtPct(rate);
document.getElementById('kpi-merge-detail').textContent = fmtNum(data.open) + ' open';
var totalClaims = 0, mergedClaims = 0, totalCost = 0;
document.getElementById('kpi-rounds').textContent =
data.median_rounds != null ? data.median_rounds.toFixed(1) : '--';
document.getElementById('kpi-rounds-detail').textContent =
data.max_rounds != null ? 'max: ' + data.max_rounds : '';
var totalClaims = 0, mergedClaims = 0;
var totalCost = 0;
var actualCount = 0, estCount = 0;
(data.prs || []).forEach(function(p) {
totalClaims += (p.claims_count || 1);
if (p.status === 'merged') mergedClaims += (p.claims_count || 1);
totalCost += estimateCost(p);
totalCost += (p.cost || 0);
if (p.cost_is_actual) actualCount++; else estCount++;
});
document.getElementById('kpi-claims').textContent = fmtNum(totalClaims);
document.getElementById('kpi-claims-detail').textContent = fmtNum(mergedClaims) + ' merged';
document.getElementById('kpi-cost').textContent = '$' + totalCost.toFixed(2);
var perClaim = totalClaims > 0 ? totalCost / totalClaims : 0;
document.getElementById('kpi-cost-detail').textContent = '$' + perClaim.toFixed(3) + '/claim';
// Show actual DB total if available, otherwise sum from PRs
var costLabel = '';
if (data.actual_total_cost > 0) {
document.getElementById('kpi-cost').textContent = '$' + data.actual_total_cost.toFixed(2);
costLabel = 'from costs table';
} else if (actualCount > 0) {
document.getElementById('kpi-cost').textContent = '$' + totalCost.toFixed(2);
costLabel = actualCount + ' actual, ' + estCount + ' est.';
} else {
document.getElementById('kpi-cost').textContent = '$' + totalCost.toFixed(2);
costLabel = 'ALL ESTIMATED';
}
var costPerClaim = totalClaims > 0 ? totalCost / totalClaims : 0;
document.getElementById('kpi-cost-detail').textContent =
'$' + costPerClaim.toFixed(3) + '/claim \u00b7 ' + costLabel;
}
function applyFilters() {
var dom = document.getElementById('filter-domain').value;
var con = document.getElementById('filter-contributor').value;
var out = document.getElementById('filter-outcome').value;
var tier = document.getElementById('filter-tier').value;
filtered = allData.filter(function(p) {
if (dom && p.domain !== dom) return false;
if (con && (p.submitted_by || 'unknown') !== con) return false;
if (out && p.status !== out) return false;
if (tier && p.tier !== tier) return false;
return true;
@ -278,6 +269,19 @@ def render_prs_page(now: datetime) -> str:
return s.length > n ? s.substring(0, n) + '...' : s;
}
function shortModel(m) {
if (!m) return '';
// Shorten model names for display
if (m.indexOf('gemini-2.5-flash') !== -1) return 'Gemini Flash';
if (m.indexOf('claude-sonnet') !== -1 || m.indexOf('sonnet-4') !== -1) return 'Sonnet';
if (m.indexOf('claude-opus') !== -1 || m.indexOf('opus') !== -1) return 'Opus';
if (m.indexOf('haiku') !== -1) return 'Haiku';
if (m.indexOf('gpt-4o') !== -1) return 'GPT-4o';
// fallback: strip provider prefix
var parts = m.split('/');
return parts[parts.length - 1];
}
function renderTable() {
var tbody = document.getElementById('pr-tbody');
var start = page * PAGE_SIZE;
@ -285,7 +289,7 @@ def render_prs_page(now: datetime) -> str:
var totalPages = Math.ceil(filtered.length / PAGE_SIZE);
if (slice.length === 0) {
tbody.innerHTML = '<tr><td colspan="10" style="text-align:center;color:#8b949e;">No PRs match filters</td></tr>';
tbody.innerHTML = '<tr><td colspan="9" style="text-align:center;color:#8b949e;">No PRs match filters</td></tr>';
return;
}
@ -297,37 +301,40 @@ def render_prs_page(now: datetime) -> str:
(p.tier || '').toLowerCase() === 'standard' ? 'tier-standard' : 'tier-light';
var date = p.created_at ? p.created_at.substring(0, 10) : '--';
// Summary: first claim title
// Summary
var summary = p.summary || '--';
var reviewSnippet = '';
if (p.status === 'closed' && p.review_snippet) {
reviewSnippet = '<div class="review-snippet">' + esc(truncate(p.review_snippet, 120)) + '</div>';
}
// Outcome with tier badge
var outcomeLabel = esc(p.status || '--');
var tierBadge = p.tier ? ' <span class="' + tierClass + '" style="font-size:10px;">' + esc(p.tier) + '</span>' : '';
// Review snippet for issues
var reviewSnippet = '';
if (p.review_snippet) {
reviewSnippet = '<div class="review-snippet">' + esc(truncate(p.review_snippet, 100)) + '</div>';
}
// Contributor display
var contributor = p.submitted_by || '--';
var contribClass = 'contributor-tag';
if (contributor.indexOf('self-directed') >= 0 || contributor === 'unknown') {
contribClass = 'contributor-self';
}
// Evaluator: domain agent + model tag
// Evaluator column: domain agent + model
var evaluator = '';
if (p.domain_agent) {
var modelShort = '';
if (p.domain_model) {
var m = p.domain_model;
if (m.indexOf('gemini') >= 0) modelShort = 'Gemini Flash';
else if (m.indexOf('gpt-4o') >= 0) modelShort = 'GPT-4o';
else if (m.indexOf('sonnet') >= 0) modelShort = 'Sonnet';
else modelShort = m.split('/').pop();
evaluator = '<div style="font-size:12px;color:#c9d1d9;">' + esc(p.domain_agent) + '</div>';
}
if (p.domain_model) {
evaluator += '<div class="model-tag">' + esc(shortModel(p.domain_model)) + '</div>';
}
if (p.leo_model) {
evaluator += '<div class="model-tag">' + esc(shortModel(p.leo_model)) + '</div>';
}
if (!evaluator) evaluator = '<span style="color:#484f58;">--</span>';
// Cost actual from DB or estimated (flagged)
var costStr;
if (p.cost != null && p.cost > 0) {
if (p.cost_is_actual) {
costStr = '<span class="cost-val">$' + p.cost.toFixed(3) + '</span>';
} else {
costStr = '<span class="cost-val" style="opacity:0.5;" title="Estimated — no actual cost tracked">~$' + p.cost.toFixed(3) + '</span>';
}
evaluator = esc(p.domain_agent) + (modelShort ? ' <span class="model-tag">' + esc(modelShort) + '</span>' : '');
} else {
costStr = '<span style="color:#484f58;">--</span>';
}
rows.push(
@ -335,17 +342,16 @@ def render_prs_page(now: datetime) -> str:
'<td><span class="expand-chevron">&#9654;</span> ' +
'<a class="pr-link" href="' + FORGEJO + p.number + '" target="_blank" rel="noopener" onclick="event.stopPropagation();">#' + p.number + '</a></td>' +
'<td style="white-space:normal;"><span class="summary-text">' + esc(summary) + '</span>' + reviewSnippet + '</td>' +
'<td style="text-align:center;">' + (p.claims_count || 1) + '</td>' +
'<td style="text-align:center;">' + (p.claims_count || '--') + '</td>' +
'<td>' + esc(p.domain || '--') + '</td>' +
'<td><span class="' + contribClass + '">' + esc(truncate(contributor, 20)) + '</span></td>' +
'<td class="' + outClass + '">' + esc(p.status || '--') + tierBadge + '</td>' +
'<td class="' + outClass + '">' + outcomeLabel + tierBadge + '</td>' +
'<td style="text-align:center;">' + (p.eval_rounds || '--') + '</td>' +
'<td>' + evaluator + '</td>' +
'<td>' + fmtCost(p.est_cost) + '</td>' +
'<td>' + costStr + '</td>' +
'<td>' + date + '</td>' +
'</tr>' +
'<tr id="trace-' + p.number + '" style="display:none;"><td colspan="10" style="padding:0;">' +
'<div class="trace-panel" id="panel-' + p.number + '">Loading...</div>' +
'<tr id="trace-' + p.number + '" style="display:none;"><td colspan="9" style="padding:0;">' +
'<div class="trace-panel" id="panel-' + p.number + '">Loading trace...</div>' +
'</td></tr>'
);
});
@ -408,34 +414,46 @@ def render_prs_page(now: datetime) -> str:
});
function loadTrace(pr, panel) {
// Find the PR data for claim titles
// Also find this PR in allData for claim list
var prData = null;
for (var i = 0; i < allData.length; i++) {
if (allData[i].number == pr) { prData = allData[i]; break; }
}
allData.forEach(function(p) { if (p.number == pr) prData = p; });
fetch('/api/trace/' + pr).then(function(r) { return r.json(); }).then(function(data) {
var html = '';
// Claims contained in this PR
if (prData && prData.description) {
var titles = prData.description.split('|').map(function(t) { return t.trim(); }).filter(Boolean);
if (titles.length > 0) {
html += '<h4>Claims (' + titles.length + ')</h4>';
html += '<ul class="claim-list">';
titles.forEach(function(t) {
html += '<li>' + esc(t) + '</li>';
});
html += '</ul>';
}
// --- Claims contained in this PR ---
if (prData && prData.claim_titles && prData.claim_titles.length > 0) {
html += '<div class="section-title">Claims (' + prData.claim_titles.length + ')</div>';
html += '<ul class="claim-list">';
prData.claim_titles.forEach(function(t) {
html += '<li>' + esc(t) + '</li>';
});
html += '</ul>';
}
// Issues (if any)
// --- Issues summary ---
var issues = [];
if (data.timeline) {
data.timeline.forEach(function(ev) {
if (ev.detail && ev.detail.issues) {
var iss = ev.detail.issues;
if (typeof iss === 'string') { try { iss = JSON.parse(iss); } catch(e) { iss = [iss]; } }
if (Array.isArray(iss)) {
iss.forEach(function(i) {
var label = String(i).replace(/_/g, ' ');
if (issues.indexOf(label) === -1) issues.push(label);
});
}
}
});
}
if (prData && prData.review_snippet) {
html += '<div class="issues-box">' + esc(prData.review_snippet) + '</div>';
} else if (issues.length > 0) {
html += '<div class="issues-box">Issues: ' + issues.map(esc).join(', ') + '</div>';
}
// Eval chain with models
// --- Eval chain (who reviewed with what model) ---
var models = {};
if (data.timeline) {
data.timeline.forEach(function(ev) {
@ -446,38 +464,23 @@ def render_prs_page(now: datetime) -> str:
}
});
}
html += '<div class="eval-chain"><strong style="color:#58a6ff;">Eval Chain:</strong> ';
var chain = [];
if (models['triage.haiku_triage'] || models['triage.deterministic_triage']) {
chain.push('<span class="chain-step">Triage <span class="model-tag">' +
esc(models['triage.haiku_triage'] || 'deterministic') + '</span></span>');
}
if (models['domain_review']) {
chain.push('<span class="chain-step">Domain <span class="model-tag">' +
esc(models['domain_review']) + '</span></span>');
}
if (models['leo_review']) {
chain.push('<span class="chain-step">Leo <span class="model-tag">' +
esc(models['leo_review']) + '</span></span>');
}
html += chain.length > 0 ? chain.join('<span class="chain-arrow">&#8594;</span>') :
'<span style="color:#484f58;">No model data</span>';
html += '</div>';
// Source + contributor metadata
if (data.pr) {
html += '<div style="margin:8px 0;font-size:12px;color:#8b949e;">';
if (data.pr.source_path) html += 'Source: <span style="color:#c9d1d9;">' + esc(data.pr.source_path) + '</span> &middot; ';
if (prData && prData.submitted_by) html += 'Contributor: <span style="color:#d2a8ff;">' + esc(prData.submitted_by) + '</span> &middot; ';
if (data.pr.tier) html += 'Tier: <span style="color:#c9d1d9;">' + esc(data.pr.tier) + '</span> &middot; ';
html += '<a class="pr-link" href="' + FORGEJO + pr + '" target="_blank">View on Forgejo</a>';
if (Object.keys(models).length > 0) {
html += '<div class="eval-chain">';
html += '<strong style="color:#58a6ff;">Eval chain:</strong> ';
var parts = [];
if (models['triage.haiku_triage'] || models['triage.deterministic_triage'])
parts.push('<span class="step"><span class="step-label">Triage</span> <span class="step-model">' + shortModel(models['triage.haiku_triage'] || 'deterministic') + '</span></span>');
if (models['domain_review'])
parts.push('<span class="step"><span class="step-label">Domain</span> <span class="step-model">' + shortModel(models['domain_review']) + '</span></span>');
if (models['leo_review'])
parts.push('<span class="step"><span class="step-label">Leo</span> <span class="step-model">' + shortModel(models['leo_review']) + '</span></span>');
html += parts.length > 0 ? parts.join(' <span class="arrow">&#8594;</span> ') : '<span style="color:#484f58;">No model data</span>';
html += '</div>';
}
// Timeline
// --- Timeline ---
if (data.timeline && data.timeline.length > 0) {
html += '<h4>Timeline</h4>';
html += '<div class="section-title">Timeline</div>';
html += '<ul class="trace-timeline">';
data.timeline.forEach(function(ev) {
var cls = ev.event === 'approved' ? 'ev-approved' :
@ -488,7 +491,7 @@ def render_prs_page(now: datetime) -> str:
if (ev.detail) {
if (ev.detail.tier) detail += ' tier=' + ev.detail.tier;
if (ev.detail.reason) detail += ' &#8212; ' + esc(ev.detail.reason);
if (ev.detail.model) detail += ' [' + esc(ev.detail.model) + ']';
if (ev.detail.model) detail += ' [' + esc(shortModel(ev.detail.model)) + ']';
if (ev.detail.review_text) {
detail += '<div class="review-text">' + esc(ev.detail.review_text).substring(0, 2000) + '</div>';
}
@ -506,19 +509,19 @@ def render_prs_page(now: datetime) -> str:
});
html += '</ul>';
} else {
html += '<div style="color:#484f58;font-size:12px;margin:8px 0;">No timeline events</div>';
html += '<div style="color:#484f58;font-size:12px;margin-top:8px;">No timeline events</div>';
}
// Reviews
// --- Reviews ---
if (data.reviews && data.reviews.length > 0) {
html += '<h4>Reviews</h4>';
html += '<div class="section-title">Reviews</div>';
data.reviews.forEach(function(r) {
var cls = r.outcome === 'approved' ? 'badge-green' :
r.outcome === 'rejected' ? 'badge-red' : 'badge-yellow';
html += '<div style="margin:4px 0;">' +
'<span class="badge ' + cls + '">' + esc(r.outcome) + '</span> ' +
'<span style="color:#8b949e;font-size:11px;">' + esc(r.reviewer || '') + ' ' +
(r.model ? '[' + esc(r.model) + ']' : '') + ' ' +
(r.model ? '[' + esc(shortModel(r.model)) + ']' : '') + ' ' +
(r.reviewed_at || '').substring(0, 19) + '</span>';
if (r.rejection_reason) {
html += ' <code>' + esc(r.rejection_reason) + '</code>';
@ -537,7 +540,7 @@ def render_prs_page(now: datetime) -> str:
}
// Filter listeners
['filter-domain', 'filter-contributor', 'filter-outcome', 'filter-tier'].forEach(function(id) {
['filter-domain', 'filter-outcome', 'filter-tier'].forEach(function(id) {
document.getElementById(id).addEventListener('change', applyFilters);
});
document.getElementById('filter-days').addEventListener('change', loadData);

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,279 @@
"""Dashboard API routes for research session + cost tracking.
Argus-side read-only endpoints. These query the data that
research_tracking.py writes to pipeline.db.
Add to app.py after alerting_routes setup.
"""
import json
import sqlite3
from aiohttp import web
def _conn(app):
"""Read-only connection to pipeline.db."""
db_path = app["db_path"]
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
conn.row_factory = sqlite3.Row
return conn
async def handle_api_research_sessions(request):
    """GET /api/research-sessions?agent=&domain=&days=7

    Returns research sessions with linked sources and cost data.

    Query parameters:
        agent: optional exact-match filter on research_sessions.agent.
        domain: optional exact-match filter on research_sessions.domain.
        days: lookback window in days (default 7; a non-integer value
            falls back to 7).

    The JSON response has a "summary" object (aggregate counts and costs)
    and a "sessions" list ordered by started_at descending, each entry
    carrying its own per-source breakdown under "sources".
    """
    agent = request.query.get("agent")
    domain = request.query.get("domain")
    try:
        days = int(request.query.get("days", 7))
    except (ValueError, TypeError):
        days = 7

    conn = _conn(request.app)
    try:
        # Only fixed SQL fragments are appended to `where`; every
        # user-supplied value is passed as a bound parameter.
        where = ["rs.started_at >= datetime('now', ?)"]
        params = [f"-{days} days"]
        if agent:
            where.append("rs.agent = ?")
            params.append(agent)
        if domain:
            where.append("rs.domain = ?")
            params.append(domain)
        where_clause = " AND ".join(where)

        # GROUP_CONCAT silently skips NULL inputs. Every per-source column
        # is therefore COALESCEd so the four concatenated lists keep one
        # entry per source row and stay positionally aligned with
        # source_paths.
        sessions = conn.execute(f"""
            SELECT rs.*,
                   GROUP_CONCAT(s.path, '||') as source_paths,
                   GROUP_CONCAT(COALESCE(s.status, ''), '||') as source_statuses,
                   GROUP_CONCAT(COALESCE(s.claims_count, 0), '||') as source_claims,
                   GROUP_CONCAT(COALESCE(s.cost_usd, 0), '||') as source_costs
            FROM research_sessions rs
            LEFT JOIN sources s ON s.session_id = rs.id
            WHERE {where_clause}
            GROUP BY rs.id
            ORDER BY rs.started_at DESC
        """, params).fetchall()

        result = []
        for s in sessions:
            sources = []
            # source_paths is NULL when the LEFT JOIN matched no sources.
            if s["source_paths"]:
                paths = s["source_paths"].split("||")
                statuses = (s["source_statuses"] or "").split("||")
                claims = (s["source_claims"] or "").split("||")
                costs = (s["source_costs"] or "").split("||")
                for i, p in enumerate(paths):
                    sources.append({
                        "path": p,
                        # '' is the COALESCE placeholder for a NULL status.
                        "status": (statuses[i] or None) if i < len(statuses) else None,
                        "claims_count": int(claims[i]) if i < len(claims) and claims[i] else 0,
                        "extraction_cost": float(costs[i]) if i < len(costs) and costs[i] else 0,
                    })

            extraction_cost = sum(src["extraction_cost"] for src in sources)
            # cost_usd may be NULL; treat it as 0 in the total instead of
            # raising TypeError on None + float.
            research_cost = s["cost_usd"]
            result.append({
                "id": s["id"],
                "agent": s["agent"],
                "domain": s["domain"],
                "topic": s["topic"],
                "reasoning": s["reasoning"],
                "summary": s["summary"],
                "sources_planned": s["sources_planned"],
                "sources_produced": s["sources_produced"],
                "model": s["model"],
                "input_tokens": s["input_tokens"],
                "output_tokens": s["output_tokens"],
                "research_cost": research_cost,
                "extraction_cost": extraction_cost,
                "total_cost": (research_cost or 0) + extraction_cost,
                "total_claims": sum(src["claims_count"] for src in sources),
                "status": s["status"],
                "started_at": s["started_at"],
                "completed_at": s["completed_at"],
                "sources": sources,
            })

        # Summary stats
        total_sessions = len(result)
        total_cost = sum(r["total_cost"] for r in result)
        total_claims = sum(r["total_claims"] for r in result)
        # sources_produced may be NULL on a row; count it as 0.
        total_sources = sum(r["sources_produced"] or 0 for r in result)

        return web.json_response({
            "summary": {
                "sessions": total_sessions,
                "total_cost": round(total_cost, 2),
                "total_claims": total_claims,
                "total_sources": total_sources,
                "avg_cost_per_claim": round(total_cost / total_claims, 4) if total_claims else 0,
                "avg_cost_per_session": round(total_cost / total_sessions, 4) if total_sessions else 0,
            },
            "sessions": result,
        })
    finally:
        conn.close()
async def handle_api_costs(request):
    """GET /api/costs?days=14&by=stage|model|date

    Comprehensive cost breakdown. Works with EXISTING data in costs table
    plus the new extraction costs once backfilled.

    Query parameters:
        days: lookback window in days for the costs table (default 14; a
            non-integer value falls back to 14).
        by: grouping column, one of "stage", "model" or "date"; any other
            value falls back to "stage".

    The JSON response contains "period_days", "grand_total", a
    "by_<group>" list of per-group totals, and a "by_agent" list of
    extraction costs taken from the sources table.
    """
    try:
        days = int(request.query.get("days", 14))
    except (ValueError, TypeError):
        days = 14
    group_by = request.query.get("by", "stage")

    conn = _conn(request.app)
    try:
        # group_by is interpolated into the SQL text below, so it must be
        # restricted to this fixed set of column names first.
        valid_groups = {"stage", "model", "date"}
        if group_by not in valid_groups:
            group_by = "stage"

        rows = conn.execute(f"""
            SELECT {group_by},
                   SUM(calls) as total_calls,
                   SUM(input_tokens) as total_input,
                   SUM(output_tokens) as total_output,
                   SUM(cost_usd) as total_cost
            FROM costs
            WHERE date >= date('now', ?)
            GROUP BY {group_by}
            ORDER BY total_cost DESC
        """, (f"-{days} days",)).fetchall()

        result = []
        for r in rows:
            result.append({
                group_by: r[group_by],
                "calls": r["total_calls"],
                "input_tokens": r["total_input"],
                "output_tokens": r["total_output"],
                # SUM() yields NULL when every cost_usd in the group is
                # NULL; report 0 rather than failing in round(None).
                "cost_usd": round(r["total_cost"] or 0, 4),
            })

        grand_total = sum(r["cost_usd"] for r in result)

        # Also get per-agent cost from sources table (extraction costs)
        # NOTE(review): this query is not limited to the `days` window even
        # though the response reports period_days — confirm that is intended.
        # NOTE(review): if one source can be linked to several PRs, the LEFT
        # JOIN repeats that source row and inflates both SUMs — verify
        # against the schema.
        agent_costs = conn.execute("""
            SELECT p.agent,
                   COUNT(DISTINCT s.path) as sources,
                   SUM(s.cost_usd) as extraction_cost,
                   SUM(s.claims_count) as claims
            FROM sources s
            LEFT JOIN prs p ON p.source_path = s.path
            WHERE s.cost_usd > 0
            GROUP BY p.agent
            ORDER BY extraction_cost DESC
        """).fetchall()

        agent_breakdown = []
        for r in agent_costs:
            extraction_cost = r["extraction_cost"] or 0
            agent_breakdown.append({
                # A NULL agent means the source matched no PR row.
                "agent": r["agent"] or "unlinked",
                "sources": r["sources"],
                "extraction_cost": round(extraction_cost, 2),
                "claims": r["claims"],
                "cost_per_claim": round(extraction_cost / r["claims"], 4) if r["claims"] else 0,
            })

        return web.json_response({
            "period_days": days,
            "grand_total": round(grand_total, 2),
            "by_" + group_by: result,
            "by_agent": agent_breakdown,
        })
    finally:
        conn.close()
async def handle_api_source_detail(request):
    """GET /api/source/{path}

    Full lifecycle of a single source: research session → extraction →
    claims → eval outcomes.

    The ``path`` route segment is matched exactly against ``sources.path``
    first; if that fails, it is matched as a "/"-anchored suffix and the
    shortest matching path wins. Responds 404 when neither matches.

    The JSON response is the source row plus ``research_session`` (row or
    null), ``prs`` (PRs whose source_path is this source) and
    ``eval_history`` (evaluate-stage audit_log events for those PRs).
    """
    source_path = request.match_info["path"]
    conn = _conn(request.app)
    try:
        # Try exact match first, fall back to suffix match (anchored)
        source = conn.execute(
            "SELECT * FROM sources WHERE path = ?",
            (source_path,),
        ).fetchone()
        if not source:
            # Suffix match — anchor with / prefix to avoid substring hits.
            # The path comes from the URL, so escape LIKE wildcards: otherwise
            # "_" matches any character and "%" matches any source at all.
            escaped = (
                source_path.replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
            )
            source = conn.execute(
                "SELECT * FROM sources WHERE path LIKE ? ESCAPE '\\' "
                "ORDER BY length(path) LIMIT 1",
                (f"%/{escaped}",),
            ).fetchone()
        if not source:
            return web.json_response({"error": "Source not found"}, status=404)

        result = dict(source)

        # Get research session if linked
        if source["session_id"]:
            session = conn.execute(
                "SELECT * FROM research_sessions WHERE id = ?",
                (source["session_id"],),
            ).fetchone()
            result["research_session"] = dict(session) if session else None
        else:
            result["research_session"] = None

        # Get PRs from this source
        prs = conn.execute(
            "SELECT number, status, domain, agent, tier, leo_verdict, domain_verdict, "
            "cost_usd, created_at, merged_at, commit_type, transient_retries, substantive_retries, last_error "
            "FROM prs WHERE source_path = ?",
            (source["path"],),
        ).fetchall()
        result["prs"] = [dict(p) for p in prs]

        # Get eval events from audit_log for those PRs
        # NOTE: audit_log.detail is mixed — some rows are JSON (evaluate events),
        # some are plain text. Use json_valid() to filter safely.
        pr_numbers = [p["number"] for p in prs]
        if pr_numbers:
            placeholders = ",".join("?" * len(pr_numbers))
            evals = conn.execute(
                "\nSELECT * FROM audit_log\n"
                "WHERE stage = 'evaluate'\n"
                "AND json_valid(detail)\n"
                f"AND json_extract(detail, '$.pr') IN ({placeholders})\n"
                "ORDER BY timestamp\n",
                pr_numbers,
            ).fetchall()
            result["eval_history"] = [
                {"timestamp": e["timestamp"], "event": e["event"],
                 "detail": json.loads(e["detail"]) if e["detail"] else None}
                for e in evals
            ]
        else:
            result["eval_history"] = []

        return web.json_response(result)
    finally:
        conn.close()
def setup_research_routes(app):
    """Attach the research-tracking GET endpoints to *app*'s router.

    Meant to be called from create_app().
    """
    routes = (
        ("/api/research-sessions", handle_api_research_sessions),
        ("/api/costs", handle_api_costs),
        # {path:.+} lets the source path contain slashes.
        ("/api/source/{path:.+}", handle_api_source_detail),
    )
    for pattern, handler in routes:
        app.router.add_get(pattern, handler)
# Exact-match public paths, to be merged into the auth middleware's allow-list.
RESEARCH_PUBLIC_PATHS = frozenset((
    "/api/research-sessions",
    "/api/costs",
))
# /api/source/{path} cannot be listed above because it needs prefix matching.
# Add this check to the auth middleware instead:
#   if path.startswith("/api/source/"): allow

View file

@ -0,0 +1,419 @@
"""Research session tracking + cost attribution for the Teleo pipeline.

This module adds three capabilities:
1. research_sessions table — tracks WHY agents researched, what they found
   interesting, session cost, and links to generated sources
2. Extraction cost attribution — writes per-source cost to sources.cost_usd
   after extraction
3. Source → claim linkage — ensures prs.source_path is always populated

Designed for Epimetheus to integrate into the pipeline. Argus built the spec;
Ganymede reviews; Epimetheus wires it in.

Data flow:
    Agent research session → research_sessions row (with reasoning + summary)
    → sources created (with session_id FK)
    → extraction runs (cost written to sources.cost_usd + costs table)
    → PRs created (source_path populated)
    → claims merged (traceable back to session)
"""
import json
import logging
import sqlite3
from datetime import datetime
from typing import Optional
logger = logging.getLogger("research_tracking")
# ---------------------------------------------------------------------------
# Migration v11: research_sessions table + sources.session_id FK
# (v9 is current; v10 is Epimetheus's eval pipeline migration)
# ---------------------------------------------------------------------------
# SQLite migration script. It creates the research_sessions table and its
# indexes, adds the sources.session_id column (with an index), and records
# version 11 in schema_version.
# NOTE(review): the CREATE statements use IF NOT EXISTS, but the ALTER TABLE
# ... ADD COLUMN and the schema_version INSERT are unguarded, so the script
# looks safe to apply only once per database — confirm the migration runner
# checks schema_version before executing it.
MIGRATION_V11_SQL = """
-- Research session tracking table
CREATE TABLE IF NOT EXISTS research_sessions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
agent TEXT NOT NULL,
-- Which agent ran the research (leo, rio, astra, etc.)
domain TEXT,
-- Primary domain of the research
topic TEXT NOT NULL,
-- What they researched (short description)
reasoning TEXT,
-- WHY they chose this topic (agent's own explanation)
summary TEXT,
-- What they found most interesting/relevant
sources_planned INTEGER DEFAULT 0,
-- How many sources they intended to produce
sources_produced INTEGER DEFAULT 0,
-- How many actually materialized
model TEXT,
-- Model used for research (e.g. claude-opus-4-6)
input_tokens INTEGER DEFAULT 0,
output_tokens INTEGER DEFAULT 0,
cost_usd REAL DEFAULT 0,
-- Total research session cost (LLM calls for discovery + writing)
status TEXT DEFAULT 'running',
-- running, completed, failed, partial
started_at TEXT DEFAULT (datetime('now')),
completed_at TEXT,
metadata TEXT DEFAULT '{}'
-- JSON: any extra context (prompt version, search queries used, etc.)
);
CREATE INDEX IF NOT EXISTS idx_rs_agent ON research_sessions(agent);
CREATE INDEX IF NOT EXISTS idx_rs_domain ON research_sessions(domain);
CREATE INDEX IF NOT EXISTS idx_rs_started ON research_sessions(started_at);
-- Add session_id FK to sources table
ALTER TABLE sources ADD COLUMN session_id INTEGER REFERENCES research_sessions(id);
CREATE INDEX IF NOT EXISTS idx_sources_session ON sources(session_id);
-- Record migration
INSERT INTO schema_version (version) VALUES (11);
"""
# ---------------------------------------------------------------------------
# Cost attribution: write extraction cost to sources.cost_usd
# ---------------------------------------------------------------------------
# Pricing per million tokens (as of March 2026)
# Maps a model identifier to its USD rates; calculate_cost() reads the "input"
# and "output" entries and divides the result by one million.
# Each Anthropic model is listed under two spellings that carry the same rates.
# NOTE(review): rates are hard-coded — confirm them against the provider's
# current price sheet whenever an entry is added or changed.
MODEL_PRICING = {
    "anthropic/claude-sonnet-4.5": {"input": 3.00, "output": 15.00},
    "anthropic/claude-sonnet-4-5": {"input": 3.00, "output": 15.00},
    "anthropic/claude-haiku-4.5": {"input": 0.80, "output": 4.00},
    "anthropic/claude-haiku-4-5-20251001": {"input": 0.80, "output": 4.00},
    "minimax/minimax-m2.5": {"input": 0.14, "output": 0.56},
}
def calculate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Return the USD cost of one call from its model name and token usage.

    Rates are looked up in MODEL_PRICING (USD per million tokens). A model
    missing from that table is priced at the Sonnet 4.5 rates, and a warning
    is logged.
    """
    rates = MODEL_PRICING.get(model)
    if not rates:
        # Default to Sonnet 4.5 pricing as conservative estimate
        logger.warning("Unknown model %s — using Sonnet 4.5 pricing", model)
        rates = {"input": 3.00, "output": 15.00}
    input_part = input_tokens * rates["input"]
    output_part = output_tokens * rates["output"]
    return (input_part + output_part) / 1_000_000
def record_extraction_cost(
    conn: sqlite3.Connection,
    source_path: str,
    model: str,
    input_tokens: int,
    output_tokens: int,
):
    """Write extraction cost to both sources.cost_usd and the costs table.

    Call this after each successful extraction call in openrouter-extract-v2.py.
    This is the missing link — the CSV logger records tokens but never writes
    cost back to the DB.

    The cost is added to whatever is already stored on the source row (a NULL
    cost_usd is treated as 0) and upserted into the per-day ``costs`` row for
    (date, model, 'extraction'). Both writes are committed together; if either
    fails, both are rolled back and the exception propagates.

    Returns the computed cost in USD.
    """
    # Local import: this module's top level only imports `datetime` itself.
    from datetime import timezone

    cost = calculate_cost(model, input_tokens, output_tokens)
    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
    # yields the same calendar date.
    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # `with conn` commits on success and rolls back on error, so the source
    # row and the costs table can never disagree about this call.
    with conn:
        # COALESCE: NULL + x is NULL in SQL, which would silently drop the cost.
        updated = conn.execute(
            "UPDATE sources SET cost_usd = COALESCE(cost_usd, 0) + ?, extraction_model = ? WHERE path = ?",
            (cost, model, source_path),
        )
        # Also record in costs table for dashboard aggregation
        conn.execute(
            "INSERT INTO costs (date, model, stage, calls, input_tokens, output_tokens, cost_usd)\n"
            "VALUES (?, ?, 'extraction', 1, ?, ?, ?)\n"
            "ON CONFLICT(date, model, stage)\n"
            "DO UPDATE SET calls = calls + 1,\n"
            "input_tokens = input_tokens + excluded.input_tokens,\n"
            "output_tokens = output_tokens + excluded.output_tokens,\n"
            "cost_usd = cost_usd + excluded.cost_usd",
            (date, model, input_tokens, output_tokens, cost),
        )

    if updated.rowcount == 0:
        # The aggregate row was still written; only the per-source attribution is missing.
        logger.warning(
            "No sources row matches %s — extraction cost recorded in costs table only",
            source_path,
        )
    logger.info(
        "Recorded extraction cost for %s: $%.4f (%d in, %d out, %s)",
        source_path, cost, input_tokens, output_tokens, model,
    )
    return cost
# ---------------------------------------------------------------------------
# Research session lifecycle
# ---------------------------------------------------------------------------
def start_session(
    conn: sqlite3.Connection,
    agent: str,
    topic: str,
    domain: Optional[str] = None,
    reasoning: Optional[str] = None,
    sources_planned: int = 0,
    model: Optional[str] = None,
    metadata: Optional[dict] = None,
) -> int:
    """Open a research session row and return its id.

    Call this at the START of a research session, before the agent produces
    any sources, stating what it plans to research and why. ``metadata`` is
    stored as a JSON string; None is stored as ``{}``.
    """
    insert_sql = (
        "INSERT INTO research_sessions\n"
        "(agent, domain, topic, reasoning, sources_planned, model, metadata)\n"
        "VALUES (?, ?, ?, ?, ?, ?, ?)"
    )
    values = (
        agent,
        domain,
        topic,
        reasoning,
        sources_planned,
        model,
        json.dumps(metadata or {}),
    )
    cursor = conn.execute(insert_sql, values)
    conn.commit()

    session_id = cursor.lastrowid
    logger.info("Started research session #%d: %s / %s", session_id, agent, topic)
    return session_id
def link_source_to_session(
    conn: sqlite3.Connection,
    source_path: str,
    session_id: int,
):
    """Link a source file to its research session.

    Call this when a source is written to inbox/ during a research session.

    The session's ``sources_produced`` counter is incremented only when a
    source row was actually (re)assigned: an unknown ``source_path`` or a
    repeated call for an already-linked source leaves the counter untouched.
    Both updates are committed together.
    """
    with conn:  # commit both updates together, roll back on error
        # `IS NOT ?` is NULL-safe: it matches unlinked rows (NULL) and rows
        # linked to a different session, but not rows already on this session.
        linked = conn.execute(
            "UPDATE sources SET session_id = ? WHERE path = ? AND session_id IS NOT ?",
            (session_id, source_path, session_id),
        )
        if linked.rowcount:
            conn.execute(
                "UPDATE research_sessions "
                "SET sources_produced = sources_produced + 1 "
                "WHERE id = ?",
                (session_id,),
            )
def complete_session(
    conn: sqlite3.Connection,
    session_id: int,
    summary: str,
    input_tokens: int = 0,
    output_tokens: int = 0,
    cost_usd: float = 0,
    status: str = "completed",
):
    """Close out a research session row.

    Call this at the END of a research session with the agent's summary of
    what it found most interesting/relevant. The cost should cover ALL LLM
    calls made during the session (web search, analysis, source writing —
    everything). ``completed_at`` is stamped by the database.
    """
    finish_sql = (
        "UPDATE research_sessions\n"
        "SET summary = ?, input_tokens = ?, output_tokens = ?,\n"
        "cost_usd = ?, status = ?, completed_at = datetime('now')\n"
        "WHERE id = ?"
    )
    values = (summary, input_tokens, output_tokens, cost_usd, status, session_id)
    conn.execute(finish_sql, values)
    conn.commit()
    logger.info("Completed research session #%d: %s", session_id, status)
# ---------------------------------------------------------------------------
# Source → PR linkage fix
# ---------------------------------------------------------------------------
def ensure_source_path_on_pr(
    conn: sqlite3.Connection,
    pr_number: int,
    source_path: str,
):
    """Fill in prs.source_path for a PR that does not have one yet.

    Call during PR creation. A value that is already set (non-NULL and
    non-empty) is never overwritten. This exists because source_path was
    historically left unset (0/1451 PRs had it when this was written).
    """
    fill_if_unset = (
        "UPDATE prs SET source_path = ? "
        "WHERE number = ? AND (source_path IS NULL OR source_path = '')"
    )
    conn.execute(fill_if_unset, (source_path, pr_number))
    conn.commit()
# ---------------------------------------------------------------------------
# Backfill: attribute extraction costs from existing CSV log
# ---------------------------------------------------------------------------
def backfill_extraction_costs(conn: sqlite3.Connection, csv_path: str):
    """One-time backfill: read openrouter-usage.csv and write costs to sources + costs tables.

    Run once to fill in the ~$338 of extraction costs that were logged to CSV
    but never written to the database.

    Per-source costs are summed across all CSV rows for the same source file
    (matching record_extraction_cost, which accumulates across calls) and are
    written only to sources whose cost_usd is still 0, so sources are not
    double-counted on a re-run.

    WARNING: the ``costs`` table is incremented for every CSV row on every
    run. Re-running over the same CSV inflates that table — run this once.

    Returns ``(rows_recorded, total_cost_usd)``.
    """
    import csv

    count = 0
    total_cost = 0.0
    # Totals per CSV source_file; the model kept is the last one seen.
    source_cost: dict = {}
    source_model: dict = {}

    # newline="" is what the csv module requires for correct quoted-newline handling.
    with open(csv_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            # Short rows yield None for missing fields; normalise to "".
            source_file = row.get("source_file") or ""
            model = row.get("model") or ""
            try:
                in_tok = int(row.get("input_tokens", 0) or 0)
                out_tok = int(row.get("output_tokens", 0) or 0)
            except (ValueError, TypeError):
                continue
            cost = calculate_cost(model, in_tok, out_tok)
            if cost <= 0:
                continue

            if source_file:
                source_cost[source_file] = source_cost.get(source_file, 0.0) + cost
                source_model[source_file] = model

            # Always record in costs table
            date = row.get("date", "unknown")
            conn.execute(
                "INSERT INTO costs (date, model, stage, calls, input_tokens, output_tokens, cost_usd)\n"
                "VALUES (?, ?, 'extraction', 1, ?, ?, ?)\n"
                "ON CONFLICT(date, model, stage)\n"
                "DO UPDATE SET calls = calls + 1,\n"
                "input_tokens = input_tokens + excluded.input_tokens,\n"
                "output_tokens = output_tokens + excluded.output_tokens,\n"
                "cost_usd = cost_usd + excluded.cost_usd",
                (date, model, in_tok, out_tok, cost),
            )
            count += 1
            total_cost += cost

    for source_file, cost in source_cost.items():
        # Try to match source_file to sources.path
        # CSV has filename, DB has full path — match on suffix.
        # ORDER BY length(path) prefers the shortest matching path.
        pattern = f"%/{source_file}" if "/" not in source_file else f"%{source_file}"
        matched = conn.execute(
            "SELECT path FROM sources WHERE path LIKE ? AND cost_usd = 0 ORDER BY length(path) LIMIT 1",
            (pattern,),
        ).fetchone()
        if matched:
            conn.execute(
                "UPDATE sources SET cost_usd = ?, extraction_model = ? WHERE path = ?",
                (cost, source_model[source_file], matched[0]),
            )

    conn.commit()
    logger.info("Backfilled %d extraction cost records, total $%.2f", count, total_cost)
    return count, total_cost
# ---------------------------------------------------------------------------
# Backfill: populate prs.source_path from branch naming convention
# ---------------------------------------------------------------------------
def backfill_source_paths(conn: sqlite3.Connection):
    """One-time backfill: derive source_path for existing PRs from branch names.

    Branch format: extract/YYYY-MM-DD-source-name or similar patterns.
    Source path format: inbox/queue/YYYY-MM-DD-source-name.md

    Only PRs whose source_path is unset (NULL or empty, the same definition
    ensure_source_path_on_pr uses) and that have a branch are considered.
    Everything after the first "/" of the branch is taken as the source stem.
    An exact ``<stem>.md`` file name is preferred; otherwise any path whose
    file name starts with the stem is accepted. The shortest path wins.

    Returns the number of PRs updated.
    """
    rows = conn.execute(
        "SELECT number, branch FROM prs "
        "WHERE (source_path IS NULL OR source_path = '') AND branch IS NOT NULL"
    ).fetchall()

    count = 0
    for number, branch in rows:
        # Common patterns: extract/source-name, claims/source-name
        _prefix, slash, source_stem = branch.partition("/")
        if not slash or not source_stem:
            continue

        # Exact file name first, so "foo" is not matched to a shorter
        # "foo-part-2.md" elsewhere; then fall back to the loose prefix match.
        matched = None
        for pattern in (f"%/{source_stem}.md", f"%/{source_stem}%"):
            matched = conn.execute(
                "SELECT path FROM sources WHERE path LIKE ? ORDER BY length(path) LIMIT 1",
                (pattern,),
            ).fetchone()
            if matched:
                break

        if matched:
            conn.execute(
                "UPDATE prs SET source_path = ? WHERE number = ?",
                (matched[0], number),
            )
            count += 1

    conn.commit()
    logger.info("Backfilled source_path for %d PRs", count)
    return count
# ---------------------------------------------------------------------------
# Integration points (for Epimetheus to wire in)
# ---------------------------------------------------------------------------
# Human-readable wiring instructions, kept as a module-level string: where to
# call record_extraction_cost, the session lifecycle helpers,
# ensure_source_path_on_pr, the one-time backfills, and the v11 migration.
# It is not referenced by any code in the visible part of this module.
INTEGRATION_GUIDE = """
## Where to wire this in
### 1. openrouter-extract-v2.py — after successful extraction call
from research_tracking import record_extraction_cost
# After line 430 (content, usage = call_openrouter(...))
# After line 672 (log_usage(...))
record_extraction_cost(
conn, args.source_file, args.model,
usage.get("prompt_tokens", 0),
usage.get("completion_tokens", 0),
)
### 2. Agent research scripts — wrap research sessions
from research_tracking import start_session, link_source_to_session, complete_session
# At start of research:
session_id = start_session(conn, agent="leo", topic="weapons stigmatization campaigns",
domain="grand-strategy",
reasoning="Following up on EU AI Act national security exclusion — exploring how stigmatization
campaigns have historically driven arms control policy",
sources_planned=6, model="claude-opus-4-6")
# As each source is written:
link_source_to_session(conn, source_path, session_id)
# At end of research:
complete_session(conn, session_id,
summary="Ottawa Treaty mine ban model is the strongest parallel to AI weapons — same
3-condition framework (humanitarian harm + low military utility + civil society
coalition). Ukraine Shahed case is a near-miss triggering event.",
input_tokens=total_in, output_tokens=total_out, cost_usd=total_cost)
### 3. PR creation in lib/merge.py or lib/validate.py — ensure source_path
from research_tracking import ensure_source_path_on_pr
# When creating a PR, pass the source:
ensure_source_path_on_pr(conn, pr_number, source_path)
### 4. One-time backfills (run manually after migration)
from research_tracking import backfill_extraction_costs, backfill_source_paths
backfill_extraction_costs(conn, "/opt/teleo-eval/logs/openrouter-usage.csv")
backfill_source_paths(conn)
### 5. Migration
Run MIGRATION_V11_SQL against pipeline.db after backing up.
"""

View file

@ -140,7 +140,7 @@ async def fetch_review_queue(
if forgejo_token:
headers["Authorization"] = f"token {forgejo_token}"
connector = aiohttp.TCPConnector(ssl=False)
connector = aiohttp.TCPConnector() # Default SSL verification — Forgejo token must not be exposed to MITM
async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
# Fetch open PRs
url = f"{FORGEJO_BASE}/repos/{REPO}/pulls?state=open&limit=50&sort=oldest"

629
ops/diagnostics/vitality.py Normal file
View file

@ -0,0 +1,629 @@
"""Agent Vitality Diagnostics — data collection and schema.
Records daily vitality snapshots per agent across 10 dimensions.
Designed as the objective function for agent "aliveness" ranking.
Owner: Ship (data collection) + Argus (storage, API, dashboard)
Data sources: pipeline.db (read-only), claim-index API, agent-state filesystem, review_records
Dimension keys (agreed with Leo 2026-04-08):
knowledge_output, knowledge_quality, contributor_engagement,
review_performance, spend_efficiency, autonomy,
infrastructure_health, social_reach, capital, external_impact
"""
import json
import logging
import os
import sqlite3
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
logger = logging.getLogger("vitality")
# Known domain agents and their primary domains
# Maps agent name -> list of domain names. collect_knowledge_quality() compares
# these against each claim's "domain" field and against the keys of the
# claim-index "domains" mapping. An empty list means the agent owns no domain.
AGENT_DOMAINS = {
    "rio": ["internet-finance"],
    "theseus": ["collective-intelligence", "living-agents"],
    "astra": ["space-development", "energy", "manufacturing", "robotics"],
    "vida": ["health"],
    "clay": ["entertainment", "cultural-dynamics"],
    "leo": ["grand-strategy", "teleohumanity"],
    "hermes": [], # communications, no domain
    "rhea": [], # infrastructure ops, no domain
    "ganymede": [], # code review, no domain
    "epimetheus": [], # pipeline, no domain
    "oberon": [], # dashboard, no domain
    "argus": [], # diagnostics, no domain
    "ship": [], # engineering, no domain
}
# Agent file path prefixes — for matching claims by location, not just domain field.
# Handles claims in core/ and foundations/ that may not have a standard domain field
# in the claim-index (domain derived from directory path).
# Agents without an entry here and with an empty AGENT_DOMAINS list get zeroed
# claim-index metrics in collect_knowledge_quality().
# NOTE(review): theseus lists "domains/ai-alignment/" here, but "ai-alignment"
# is not among its AGENT_DOMAINS — confirm whether that is intentional.
AGENT_PATHS = {
    "rio": ["domains/internet-finance/"],
    "theseus": ["domains/ai-alignment/", "core/living-agents/", "core/collective-intelligence/",
                "foundations/collective-intelligence/"],
    "astra": ["domains/space-development/", "domains/energy/",
              "domains/manufacturing/", "domains/robotics/"],
    "vida": ["domains/health/"],
    "clay": ["domains/entertainment/", "foundations/cultural-dynamics/"],
    "leo": ["core/grand-strategy/", "core/teleohumanity/", "core/mechanisms/",
            "core/living-capital/", "foundations/teleological-economics/",
            "foundations/critical-systems/"],
}
# Every agent name known to this module, in AGENT_DOMAINS order.
ALL_AGENTS = list(AGENT_DOMAINS.keys())
# Agent-state directory (VPS filesystem)
# Overridable through the AGENT_STATE_DIR environment variable;
# collect_infrastructure_health() reads <dir>/<agent>/metrics.json from it.
AGENT_STATE_DIR = Path(os.environ.get(
    "AGENT_STATE_DIR", "/opt/teleo-eval/agent-state"
))
# Idempotent DDL for the vitality_snapshots table and its two indexes; run by
# ensure_schema() via executescript(). One row holds a single metric value for
# an (agent, dimension) pair; the UNIQUE constraint permits one value per
# (agent_name, dimension, metric, recorded_at).
MIGRATION_SQL = """
CREATE TABLE IF NOT EXISTS vitality_snapshots (
id INTEGER PRIMARY KEY AUTOINCREMENT,
agent_name TEXT NOT NULL,
dimension TEXT NOT NULL,
metric TEXT NOT NULL,
value REAL NOT NULL DEFAULT 0,
unit TEXT NOT NULL DEFAULT '',
source TEXT,
recorded_at TEXT NOT NULL DEFAULT (datetime('now')),
UNIQUE(agent_name, dimension, metric, recorded_at)
);
CREATE INDEX IF NOT EXISTS idx_vitality_agent_time
ON vitality_snapshots(agent_name, recorded_at);
CREATE INDEX IF NOT EXISTS idx_vitality_dimension
ON vitality_snapshots(dimension, recorded_at);
"""
# Add source column if missing (idempotent upgrade from v1 schema)
# The statement itself is not idempotent: it fails when the column already
# exists, which is always the case for tables created by MIGRATION_SQL above.
# ensure_schema() is responsible for tolerating that.
UPGRADE_SQL = """
ALTER TABLE vitality_snapshots ADD COLUMN source TEXT;
"""
def ensure_schema(db_path: str):
    """Create the vitality_snapshots table if it doesn't exist.

    Runs MIGRATION_SQL, then applies UPGRADE_SQL only when the table lacks the
    ``source`` column (tables created by the v1 schema). Safe to call
    repeatedly. Opens its own read-write connection to ``db_path`` and always
    closes it.

    Raises sqlite3.Error if the schema cannot be created or upgraded.
    """
    conn = sqlite3.connect(db_path, timeout=30)
    try:
        conn.executescript(MIGRATION_SQL)
        # Check for the column explicitly instead of swallowing every
        # OperationalError from ALTER TABLE: that would also hide real
        # failures such as a locked or read-only database.
        # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk).
        columns = {row[1] for row in conn.execute("PRAGMA table_info(vitality_snapshots)")}
        if "source" not in columns:
            conn.execute(UPGRADE_SQL)
        conn.commit()
        logger.info("vitality_snapshots schema ensured")
    finally:
        conn.close()
def _fetch_claim_index(url: str = "http://localhost:8080/claim-index") -> dict | None:
"""Fetch claim-index from pipeline health API."""
try:
req = urllib.request.Request(url, headers={"Accept": "application/json"})
with urllib.request.urlopen(req, timeout=10) as resp:
return json.loads(resp.read())
except Exception as e:
logger.warning("claim-index fetch failed: %s", e)
return None
def _ro_conn(db_path: str) -> sqlite3.Connection:
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=30)
conn.row_factory = sqlite3.Row
return conn
# ---------------------------------------------------------------------------
# Dimension 1: knowledge_output — "How much has this agent produced?"
# ---------------------------------------------------------------------------
def collect_knowledge_output(conn: sqlite3.Connection, agent: str) -> list[dict]:
    """Report how much *agent* has produced.

    Three counts from the prs table: merged PRs (``claims_merged``), distinct
    non-NULL domains among merged PRs (``domains_contributed``) and PRs
    created in the last 7 days (``prs_7d``).
    """
    # (metric name, unit, single-row COUNT query aliased as `cnt`)
    count_queries = (
        (
            "claims_merged",
            "claims",
            "SELECT COUNT(*) as cnt FROM prs WHERE agent = ? AND status = 'merged'",
        ),
        (
            "domains_contributed",
            "domains",
            "SELECT COUNT(DISTINCT domain) as cnt FROM prs "
            "WHERE agent = ? AND domain IS NOT NULL AND status = 'merged'",
        ),
        (
            "prs_7d",
            "PRs",
            "SELECT COUNT(*) as cnt FROM prs WHERE agent = ? AND created_at > datetime('now', '-7 days')",
        ),
    )
    return [
        {
            "metric": name,
            "value": conn.execute(sql, (agent,)).fetchone()["cnt"],
            "unit": unit,
        }
        for name, unit, sql in count_queries
    ]
# ---------------------------------------------------------------------------
# Dimension 2: knowledge_quality — "How good is the output?"
# ---------------------------------------------------------------------------
def collect_knowledge_quality(
    conn: sqlite3.Connection, claim_index: dict | None, agent: str
) -> list[dict]:
    """Report output-quality signals for *agent*.

    From the prs table: ``challenge_rate`` (share of typed PRs whose
    commit_type is "challenge") and ``activity_breadth`` (number of distinct
    commit types). From the claim index: ``evidence_density``,
    ``cross_domain_links`` and ``domain_coverage``. The last three are
    reported as 0 when no claim index is available or the agent has neither
    domains nor path prefixes configured.
    """
    own_domains = AGENT_DOMAINS.get(agent, [])

    # Challenge rate = challenge PRs / total PRs
    type_rows = conn.execute(
        "SELECT commit_type, COUNT(*) as cnt FROM prs "
        "WHERE agent = ? AND commit_type IS NOT NULL GROUP BY commit_type",
        (agent,),
    ).fetchall()
    pr_total = sum(r["cnt"] for r in type_rows)
    per_type = {r["commit_type"]: r["cnt"] for r in type_rows}
    if pr_total > 0:
        challenge_rate = per_type.get("challenge", 0) / pr_total
    else:
        challenge_rate = 0

    metrics = [
        {"metric": "challenge_rate", "value": round(challenge_rate, 4), "unit": "ratio"},
        # Activity breadth (distinct commit types)
        {"metric": "activity_breadth", "value": len(per_type), "unit": "types"},
    ]

    own_paths = AGENT_PATHS.get(agent, [])
    if not (claim_index and (own_domains or own_paths)):
        metrics.append({"metric": "evidence_density", "value": 0, "unit": "ratio"})
        metrics.append({"metric": "cross_domain_links", "value": 0, "unit": "links"})
        metrics.append({"metric": "domain_coverage", "value": 0, "unit": "ratio"})
        return metrics

    # A claim belongs to the agent by domain field OR by file path prefix
    # (the latter catches core/ and foundations/ claims).
    mine = [
        claim for claim in claim_index.get("claims", [])
        if claim.get("domain") in own_domains
        or any(claim.get("file", "").startswith(prefix) for prefix in own_paths)
    ]
    claim_total = len(mine)

    # Evidence density: claims with incoming links / total claims
    with_incoming = sum(1 for claim in mine if claim.get("incoming_count", 0) > 0)
    density = with_incoming / claim_total if claim_total > 0 else 0
    metrics.append({"metric": "evidence_density", "value": round(density, 4), "unit": "ratio"})

    # Cross-domain links: outgoing links whose text contains the name of a
    # domain the agent does not own.
    cross_domain = 0
    for claim in mine:
        for link in claim.get("outgoing_links", []):
            if any(d in link for d in claim_index.get("domains", {}).keys()
                   if d not in own_domains):
                cross_domain += 1
    metrics.append({"metric": "cross_domain_links", "value": cross_domain, "unit": "links"})

    # Domain coverage: agent's claims / average domain size, capped at 1.0
    domains_data = claim_index.get("domains", {})
    own_claim_count = sum(domains_data.get(d, 0) for d in own_domains)
    if domains_data:
        avg_domain_size = sum(domains_data.values()) / len(domains_data)
    else:
        avg_domain_size = 1
    coverage = min(own_claim_count / avg_domain_size, 1.0) if avg_domain_size > 0 else 0
    metrics.append({"metric": "domain_coverage", "value": round(coverage, 4), "unit": "ratio"})
    return metrics
# ---------------------------------------------------------------------------
# Dimension 3: contributor_engagement — "Who contributes to this agent's domain?"
# ---------------------------------------------------------------------------
def collect_contributor_engagement(conn: sqlite3.Connection, agent: str) -> list[dict]:
    """Count distinct non-empty ``submitted_by`` values on *agent*'s PRs."""
    distinct_submitters_sql = (
        "SELECT COUNT(DISTINCT submitted_by) as cnt FROM prs "
        "WHERE agent = ? AND submitted_by IS NOT NULL AND submitted_by != ''"
    )
    submitters = conn.execute(distinct_submitters_sql, (agent,)).fetchone()["cnt"]
    return [{"metric": "unique_submitters", "value": submitters, "unit": "contributors"}]
# ---------------------------------------------------------------------------
# Dimension 4: review_performance — "How good is the evaluator feedback loop?"
# ---------------------------------------------------------------------------
def collect_review_performance(conn: sqlite3.Connection, agent: str) -> list[dict]:
    """Approval rate and rejection reasons for *agent*, from review_records.

    Reviews are attributed to an agent by joining review_records.pr_number to
    prs.number; the agent name is compared case-insensitively.

    Returns ``total_reviews``, ``approval_rate`` (approved plus
    approved-with-changes, over total), the three per-outcome counts, and one
    ``rejection_<reason>`` metric per rejection reason seen in the last 30
    days. If the review_records table does not exist, only zeroed
    ``approval_rate`` and ``total_reviews`` are returned.
    """
    metrics = []

    # Check if review_records table exists
    table_check = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='review_records'"
    ).fetchone()
    if not table_check:
        return [
            {"metric": "approval_rate", "value": 0, "unit": "ratio"},
            {"metric": "total_reviews", "value": 0, "unit": "reviews"},
        ]

    # Overall approval rate for this agent's claims (join through prs table).
    # The PR number column on prs is `number` (as used by every other query on
    # this table); joining on a non-existent `p.pr_number` makes the whole
    # collector fail.
    row = conn.execute(
        "SELECT COUNT(*) as total, "
        "SUM(CASE WHEN r.outcome = 'approved' THEN 1 ELSE 0 END) as approved, "
        "SUM(CASE WHEN r.outcome = 'approved-with-changes' THEN 1 ELSE 0 END) as with_changes, "
        "SUM(CASE WHEN r.outcome = 'rejected' THEN 1 ELSE 0 END) as rejected "
        "FROM review_records r "
        "JOIN prs p ON r.pr_number = p.number "
        "WHERE LOWER(p.agent) = LOWER(?)",
        (agent,),
    ).fetchone()
    # SUM() over zero rows is NULL, hence the `or 0` fallbacks.
    total = row["total"] or 0
    approved_only = row["approved"] or 0
    with_changes = row["with_changes"] or 0
    rejected = row["rejected"] or 0
    approval_rate = (approved_only + with_changes) / total if total > 0 else 0

    metrics.append({"metric": "total_reviews", "value": total, "unit": "reviews"})
    metrics.append({"metric": "approval_rate", "value": round(approval_rate, 4), "unit": "ratio"})
    metrics.append({"metric": "approved", "value": approved_only, "unit": "reviews"})
    metrics.append({"metric": "approved_with_changes", "value": with_changes, "unit": "reviews"})
    metrics.append({"metric": "rejected", "value": rejected, "unit": "reviews"})

    # Top rejection reasons (last 30 days)
    reasons = conn.execute(
        "SELECT r.rejection_reason, COUNT(*) as cnt FROM review_records r "
        "JOIN prs p ON r.pr_number = p.number "
        "WHERE LOWER(p.agent) = LOWER(?) AND r.outcome = 'rejected' "
        "AND r.rejection_reason IS NOT NULL "
        "AND r.review_date > datetime('now', '-30 days') "
        "GROUP BY r.rejection_reason ORDER BY cnt DESC",
        (agent,),
    ).fetchall()
    metrics.extend(
        {
            "metric": f"rejection_{r['rejection_reason']}",
            "value": r["cnt"],
            "unit": "rejections",
        }
        for r in reasons
    )
    return metrics
# ---------------------------------------------------------------------------
# Dimension 5: spend_efficiency — "What does it cost per merged claim?"
# ---------------------------------------------------------------------------
def collect_spend_efficiency(conn: sqlite3.Connection, agent: str) -> list[dict]:
    """Report what *agent* costs.

    Pipeline spend on merged PRs (total and per merged PR), plus the agent's
    response_audit generation cost: all-time total, response count and the
    last-24-hours total.
    """
    # Pipeline cost attributed to this agent (from prs.cost_usd)
    pipeline = conn.execute(
        "SELECT COALESCE(SUM(cost_usd), 0) as cost, COUNT(*) as merged "
        "FROM prs WHERE agent = ? AND status = 'merged'",
        (agent,),
    ).fetchone()
    pipeline_cost = pipeline["cost"] or 0
    merged_count = pipeline["merged"] or 0
    per_claim = pipeline_cost / merged_count if merged_count > 0 else 0

    # Response audit costs (Telegram bot) — per-agent
    responses = conn.execute(
        "SELECT COALESCE(SUM(generation_cost), 0) as cost, COUNT(*) as cnt "
        "FROM response_audit WHERE agent = ?",
        (agent,),
    ).fetchone()

    # 24h spend snapshot
    last_day = conn.execute(
        "SELECT COALESCE(SUM(generation_cost), 0) as cost "
        "FROM response_audit WHERE agent = ? AND timestamp > datetime('now', '-24 hours')",
        (agent,),
    ).fetchone()

    return [
        {"metric": "total_pipeline_cost", "value": round(pipeline_cost, 4), "unit": "USD"},
        {"metric": "cost_per_merged_claim", "value": round(per_claim, 4), "unit": "USD"},
        {"metric": "response_cost_total", "value": round(responses["cost"], 4), "unit": "USD"},
        {"metric": "total_responses", "value": responses["cnt"], "unit": "responses"},
        {"metric": "response_cost_24h", "value": round(last_day["cost"], 4), "unit": "USD"},
    ]
# ---------------------------------------------------------------------------
# Dimension 6: autonomy — "How independently does this agent act?"
# ---------------------------------------------------------------------------
def collect_autonomy(conn: sqlite3.Connection, agent: str) -> list[dict]:
    """Report how independently *agent* acts.

    Counts response_audit rows from the last 24 hours and the number of
    distinct calendar days with a PR created in the last 7 days.
    """
    # Autonomous responses in last 24h
    recent_responses = conn.execute(
        "SELECT COUNT(*) as cnt FROM response_audit "
        "WHERE agent = ? AND timestamp > datetime('now', '-24 hours')",
        (agent,),
    ).fetchone()["cnt"]

    # Active days in last 7
    active_days = conn.execute(
        "SELECT COUNT(DISTINCT date(created_at)) as days FROM prs "
        "WHERE agent = ? AND created_at > datetime('now', '-7 days')",
        (agent,),
    ).fetchone()["days"]

    return [
        {"metric": "autonomous_responses_24h", "value": recent_responses, "unit": "actions"},
        {"metric": "active_days_7d", "value": active_days, "unit": "days"},
    ]
# ---------------------------------------------------------------------------
# Dimension 7: infrastructure_health — "Is the agent's machinery working?"
# ---------------------------------------------------------------------------
def collect_infrastructure_health(conn: sqlite3.Connection, agent: str) -> list[dict]:
    """Report whether *agent*'s machinery is working.

    Emits the number of circuit breakers whose name contains the agent name
    and whose state is not "closed", the 7-day merge rate of the agent's PRs,
    and, when ``AGENT_STATE_DIR/<agent>/metrics.json`` exists and parses, the
    lifetime session counters from that file.
    """
    # Circuit breakers
    breakers = conn.execute(
        "SELECT name, state FROM circuit_breakers WHERE name LIKE ?",
        (f"%{agent}%",),
    ).fetchall()
    not_closed = sum(1 for breaker in breakers if breaker["state"] != "closed")

    # PR success rate last 7 days
    recent = conn.execute(
        "SELECT COUNT(*) as total, "
        "SUM(CASE WHEN status='merged' THEN 1 ELSE 0 END) as merged "
        "FROM prs WHERE agent = ? AND created_at > datetime('now', '-7 days')",
        (agent,),
    ).fetchone()
    pr_total = recent["total"]
    # `merged` is NULL when there are no rows, so only divide when pr_total > 0.
    merge_rate = recent["merged"] / pr_total if pr_total > 0 else 0

    metrics = [
        {"metric": "open_circuit_breakers", "value": not_closed, "unit": "breakers"},
        {"metric": "merge_rate_7d", "value": round(merge_rate, 4), "unit": "ratio"},
    ]

    # Agent-state liveness (read metrics.json from filesystem)
    state_file = AGENT_STATE_DIR / agent / "metrics.json"
    if not state_file.exists():
        return metrics
    try:
        with open(state_file) as f:
            state = json.load(f)
        lifetime = state.get("lifetime", {})
        for counter in ("sessions_total", "sessions_timeout", "sessions_error"):
            metrics.append({
                "metric": counter,
                "value": lifetime.get(counter, 0),
                "unit": "sessions",
            })
    except (json.JSONDecodeError, OSError) as e:
        logger.warning("Failed to read agent-state for %s: %s", agent, e)
    return metrics
# ---------------------------------------------------------------------------
# Dimensions 8-10: Stubs (no data sources yet)
# ---------------------------------------------------------------------------
def collect_social_reach(agent: str) -> list[dict]:
    """Social dimension placeholder: all zeros until X API accounts are active."""
    placeholders = (
        ("followers", "followers"),
        ("impressions_7d", "impressions"),
        ("engagement_rate", "ratio"),
    )
    return [{"metric": metric, "value": 0, "unit": unit} for metric, unit in placeholders]
def collect_capital(agent: str) -> list[dict]:
    """Capital dimension placeholder: all zeros until treasury/revenue tracking exists."""
    return [
        {"metric": metric, "value": 0, "unit": "USD"}
        for metric in ("aum", "treasury")
    ]
def collect_external_impact(agent: str) -> list[dict]:
    """Return placeholder external-impact metrics (stub zeros until manual tracking exists).

    *agent* is accepted but not read; every value is a hard-coded 0.
    """
    placeholders = (
        ("decisions_informed", "decisions"),
        ("deals_sourced", "deals"),
    )
    return [
        {"metric": metric_name, "value": 0, "unit": unit_name}
        for metric_name, unit_name in placeholders
    ]
# ---------------------------------------------------------------------------
# Orchestration
# ---------------------------------------------------------------------------
# Registry of per-agent collectors, keyed by dimension name.
# collect_all_for_agent() invokes every value as collector(conn, claim_index, agent),
# so each entry must accept exactly those three positional arguments. The lambdas
# adapt collectors that take fewer arguments (the `ci` claim-index argument is
# dropped); collect_knowledge_quality is registered directly and therefore
# receives all three.
DIMENSION_MAP = {
    "knowledge_output": lambda conn, ci, agent: collect_knowledge_output(conn, agent),
    "knowledge_quality": collect_knowledge_quality,
    "contributor_engagement": lambda conn, ci, agent: collect_contributor_engagement(conn, agent),
    "review_performance": lambda conn, ci, agent: collect_review_performance(conn, agent),
    "spend_efficiency": lambda conn, ci, agent: collect_spend_efficiency(conn, agent),
    "autonomy": lambda conn, ci, agent: collect_autonomy(conn, agent),
    "infrastructure_health": lambda conn, ci, agent: collect_infrastructure_health(conn, agent),
    # Stub dimensions: these collectors need neither the DB nor the claim index.
    "social_reach": lambda conn, ci, agent: collect_social_reach(agent),
    "capital": lambda conn, ci, agent: collect_capital(agent),
    "external_impact": lambda conn, ci, agent: collect_external_impact(agent),
}
def collect_all_for_agent(
    db_path: str,
    agent: str,
    claim_index_url: str = "http://localhost:8080/claim-index",
) -> dict:
    """Run every collector registered in DIMENSION_MAP for one agent.

    Returns {dimension: [metrics]}. A collector that raises is logged and its
    dimension is reported as an empty list, so one failing dimension does not
    discard the others.
    """
    index_payload = _fetch_claim_index(claim_index_url)
    ro_conn = _ro_conn(db_path)

    def _safe_collect(dimension, collect):
        # Isolate each collector: log the failure and fall back to "no metrics".
        try:
            return collect(ro_conn, index_payload, agent)
        except Exception as exc:
            logger.error("collector %s failed for %s: %s", dimension, agent, exc)
            return []

    try:
        return {
            dimension: _safe_collect(dimension, collect)
            for dimension, collect in DIMENSION_MAP.items()
        }
    finally:
        ro_conn.close()
def collect_system_aggregate(
    db_path: str,
    claim_index_url: str = "http://localhost:8080/claim-index",
) -> dict:
    """Collect system-level aggregate vitality metrics.

    Returns {dimension: [metrics]} built from the claim index (knowledge
    totals, cross-domain citation rate) and the pipeline database (throughput,
    queue depth, 24h spend). The "knowledge_quality" dimension is only present
    when a non-empty claim index was fetched.
    """
    # A falsy fetch result is treated as an empty index so every lookup below
    # can use .get() with a default instead of branching on the index each time.
    claim_index = _fetch_claim_index(claim_index_url) or {}
    conn = _ro_conn(db_path)
    try:
        metrics = {}

        # Knowledge totals
        metrics["knowledge_output"] = [
            {"metric": "total_claims",
             "value": claim_index.get("total_claims", 0), "unit": "claims"},
            {"metric": "total_domains",
             "value": len(claim_index.get("domains", {})), "unit": "domains"},
            {"metric": "orphan_ratio",
             "value": round(claim_index.get("orphan_ratio", 0), 4), "unit": "ratio"},
        ]

        # Cross-domain citation rate
        if claim_index:
            claims = claim_index.get("claims", [])
            total_links = sum(c.get("outgoing_count", 0) for c in claims)
            # Read each candidate's fields once instead of on every comparison
            # in the innermost loop.
            candidates = [
                (c.get("stem"), c.get("file") or "", c.get("domain"))
                for c in claims
            ]
            cross_domain = 0
            for claim in claims:
                src_domain = claim.get("domain")
                for link in claim.get("outgoing_links", []):
                    suffix = link + ".md"
                    for stem, file_path, domain in candidates:
                        if domain == src_domain:
                            continue
                        # A missing stem would raise TypeError in `in`, and an
                        # empty stem would match every link, so both are
                        # excluded from stem matching.
                        if (stem and stem in link) or file_path.endswith(suffix):
                            cross_domain += 1
            metrics["knowledge_quality"] = [
                {"metric": "cross_domain_citation_rate",
                 "value": round(cross_domain / max(total_links, 1), 4),
                 "unit": "ratio"},
            ]

        # Pipeline throughput
        merged_row = conn.execute(
            "SELECT COUNT(*) as merged FROM prs "
            "WHERE status='merged' AND merged_at > datetime('now', '-24 hours')"
        ).fetchone()
        sources_row = conn.execute("SELECT COUNT(*) as total FROM sources").fetchone()
        pending_row = conn.execute(
            "SELECT COUNT(*) as pending FROM prs "
            "WHERE status NOT IN ('merged','rejected','closed')"
        ).fetchone()
        metrics["infrastructure_health"] = [
            {"metric": "prs_merged_24h", "value": merged_row["merged"], "unit": "PRs/day"},
            {"metric": "total_sources", "value": sources_row["total"], "unit": "sources"},
            {"metric": "queue_depth", "value": pending_row["pending"], "unit": "PRs"},
        ]

        # Total spend
        pipeline_row = conn.execute(
            "SELECT COALESCE(SUM(cost_usd), 0) as cost "
            "FROM costs WHERE date > date('now', '-1 day')"
        ).fetchone()
        response_row = conn.execute(
            "SELECT COALESCE(SUM(generation_cost), 0) as cost FROM response_audit "
            "WHERE timestamp > datetime('now', '-24 hours')"
        ).fetchone()
        pipeline_cost = pipeline_row["cost"]
        response_cost = response_row["cost"]
        metrics["spend_efficiency"] = [
            {"metric": "pipeline_cost_24h", "value": round(pipeline_cost, 4), "unit": "USD"},
            {"metric": "response_cost_24h", "value": round(response_cost, 4), "unit": "USD"},
            {"metric": "total_cost_24h",
             "value": round(pipeline_cost + response_cost, 4), "unit": "USD"},
        ]

        # Stubs
        metrics["social_reach"] = [{"metric": "total_followers", "value": 0, "unit": "followers"}]
        metrics["capital"] = [{"metric": "total_aum", "value": 0, "unit": "USD"}]
        return metrics
    finally:
        conn.close()
def record_snapshot(
    db_path: str,
    claim_index_url: str = "http://localhost:8080/claim-index",
):
    """Collect and persist one vitality snapshot.

    One row is written per (agent, dimension, metric), plus system-aggregate
    rows under the agent name "_system"; all rows share a single UTC
    timestamp. A failure for one agent or for the system aggregate is logged
    and the remaining rows are still written.

    Returns a dict with "rows_written", "agents" and "recorded_at".
    """
    recorded_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    pending = []

    def _stage(owner, dimension, entry, source):
        # Column order matches the INSERT statement below.
        pending.append((
            owner, dimension, entry["metric"], entry["value"],
            entry["unit"], source, recorded_at,
        ))

    # Per-agent snapshots
    for agent in ALL_AGENTS:
        try:
            per_dimension = collect_all_for_agent(db_path, agent, claim_index_url)
            for dimension, entries in per_dimension.items():
                source_tag = f"{dimension}_collector"
                for entry in entries:
                    _stage(agent, dimension, entry, source_tag)
        except Exception as exc:
            logger.error("vitality collection failed for %s: %s", agent, exc)

    # System aggregate
    try:
        aggregate = collect_system_aggregate(db_path, claim_index_url)
        for dimension, entries in aggregate.items():
            for entry in entries:
                _stage("_system", dimension, entry, "system_aggregate")
    except Exception as exc:
        logger.error("vitality system aggregate failed: %s", exc)

    # Write all rows
    ensure_schema(db_path)
    writer = sqlite3.connect(db_path, timeout=30)
    try:
        writer.executemany(
            "INSERT OR REPLACE INTO vitality_snapshots "
            "(agent_name, dimension, metric, value, unit, source, recorded_at) "
            "VALUES (?, ?, ?, ?, ?, ?, ?)",
            pending,
        )
        writer.commit()
        row_count = len(pending)
        agent_count = len(ALL_AGENTS)
        logger.info(
            "vitality snapshot recorded: %d rows for %d agents + system",
            row_count, agent_count,
        )
        return {"rows_written": row_count, "agents": agent_count, "recorded_at": recorded_at}
    finally:
        writer.close()
if __name__ == "__main__":
    # CLI: python3 vitality.py [db_path] — runs a snapshot and prints the summary as JSON.
    import sys

    logging.basicConfig(level=logging.INFO)
    cli_args = sys.argv[1:]
    db = cli_args[0] if cli_args else "/opt/teleo-eval/pipeline/pipeline.db"
    result = record_snapshot(db)
    print(json.dumps(result, indent=2))

View file

@ -0,0 +1,293 @@
"""Vitality API routes for Argus diagnostics dashboard.
Endpoints:
GET /api/vitality latest snapshot + time-series for all agents or one
GET /api/vitality/snapshot trigger a new snapshot (POST-like via GET for cron curl; requires ?confirm=1)
GET /api/vitality/leaderboard agents ranked by composite vitality score
Owner: Argus
"""
import json
import logging
import sqlite3
from pathlib import Path
from aiohttp import web
from vitality import (
ALL_AGENTS,
MIGRATION_SQL,
collect_all_for_agent,
collect_system_aggregate,
record_snapshot,
)
logger = logging.getLogger("argus.vitality")
# Composite vitality weights — Leo-approved 2026-04-08
# Dimension keys match Ship's refactored vitality.py DIMENSION_MAP
# Each per-dimension score is multiplied by its weight and summed to form the
# leaderboard's composite score.
# NOTE(review): the weights below sum to 0.90, not 1.00, so the maximum
# composite score is 0.90 when every dimension score is capped at 1 — confirm
# this is intended.
VITALITY_WEIGHTS = {
    "knowledge_output": 0.30, # primary output — highest weight
    "knowledge_quality": 0.20, # was "diversity" — quality of output
    "contributor_engagement": 0.15, # attracting external contributors
    "review_performance": 0.00, # new dim, zero until review_records populated
    "autonomy": 0.15, # independent action
    "infrastructure_health": 0.05, # machinery working
    "spend_efficiency": 0.05, # cost discipline
    "social_reach": 0.00, # zero until accounts active
    "capital": 0.00, # zero until treasury exists
    "external_impact": 0.00, # zero until measurable
}
# Public paths (no auth required)
# NOTE(review): "/api/vitality/snapshot" triggers a snapshot write through
# record_snapshot(); confirm it is meant to be reachable without auth.
VITALITY_PUBLIC_PATHS = frozenset({
    "/api/vitality",
    "/api/vitality/snapshot",
    "/api/vitality/leaderboard",
})
def _ro_conn(db_path: str) -> sqlite3.Connection:
    """Open *db_path* as a read-only SQLite connection whose rows are sqlite3.Row.

    The path is converted with Path.as_uri() so characters that are special
    inside a URI ("?", "#", "%", spaces) are percent-encoded rather than being
    parsed as part of the query string. resolve() is needed because as_uri()
    only accepts absolute paths; relative paths are resolved against the
    current working directory.
    """
    uri = f"{Path(db_path).resolve().as_uri()}?mode=ro"
    conn = sqlite3.connect(uri, uri=True, timeout=30)
    conn.row_factory = sqlite3.Row
    return conn
async def handle_vitality(request: web.Request) -> web.Response:
    """GET /api/vitality?agent=<name>&days=7

    Returns the latest snapshot and time-series data as JSON.

    Query parameters:
        agent: when given, only that agent is returned and its time series is
            used; otherwise every agent plus "_system" is returned and the
            time series is taken from "_system".
        days: time-series window, clamped to 1..90; non-numeric input falls
            back to 7.

    Responds with {"has_data": False, ...} when the snapshot table is missing
    or empty.
    """
    db_path = request.app["db_path"]
    agent = request.query.get("agent")
    try:
        days = int(request.query.get("days", "7"))
    except ValueError:
        days = 7
    # Clamp both ends: a negative value would build a modifier such as
    # "--3 days", which is not a valid SQLite date modifier.
    days = max(1, min(days, 90))

    conn = _ro_conn(db_path)
    try:
        # Check if table exists
        table_check = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='vitality_snapshots'"
        ).fetchone()
        if not table_check:
            return web.json_response({
                "error": "No vitality data yet. Trigger a snapshot first via /api/vitality/snapshot",
                "has_data": False
            })

        # Latest snapshot timestamp
        latest = conn.execute(
            "SELECT MAX(recorded_at) as ts FROM vitality_snapshots"
        ).fetchone()
        latest_ts = latest["ts"] if latest else None
        if not latest_ts:
            return web.json_response({"has_data": False})

        # Latest snapshot data
        agents_filter = [agent] if agent else ALL_AGENTS + ["_system"]
        result = {"latest_snapshot": latest_ts, "agents": {}}
        for name in agents_filter:
            rows = conn.execute(
                "SELECT dimension, metric, value, unit FROM vitality_snapshots "
                "WHERE agent_name = ? AND recorded_at = ?",
                (name, latest_ts)
            ).fetchall()
            if not rows:
                continue
            dimensions = {}
            for r in rows:
                dimensions.setdefault(r["dimension"], []).append({
                    "metric": r["metric"],
                    "value": r["value"],
                    "unit": r["unit"],
                })
            result["agents"][name] = dimensions

        # Time-series for trend charts (one data point per snapshot).
        # record_snapshot() stores recorded_at as "%Y-%m-%dT%H:%M:%SZ", so the
        # cutoff is rendered in the same format. datetime('now', ...) yields
        # "YYYY-MM-DD HH:MM:SS", and because "T" sorts after " " a string
        # comparison against it admits every row from the cutoff's calendar day.
        ts_query_agent = agent if agent else "_system"
        ts_rows = conn.execute(
            "SELECT recorded_at, dimension, metric, value "
            "FROM vitality_snapshots "
            "WHERE agent_name = ? "
            "AND recorded_at > strftime('%Y-%m-%dT%H:%M:%SZ', 'now', ?) "
            "ORDER BY recorded_at",
            (ts_query_agent, f"-{days} days")
        ).fetchall()
        time_series = {}
        for r in ts_rows:
            key = f"{r['dimension']}.{r['metric']}"
            time_series.setdefault(key, []).append({
                "t": r["recorded_at"],
                "v": r["value"],
            })
        result["time_series"] = time_series
        result["has_data"] = True
        return web.json_response(result)
    finally:
        conn.close()
async def handle_vitality_snapshot(request: web.Request) -> web.Response:
    """GET /api/vitality/snapshot?confirm=1 — trigger a new snapshot collection.

    Used by cron: curl 'http://localhost:8081/api/vitality/snapshot?confirm=1'
    Requires ?confirm=1 to prevent accidental triggers from crawlers/prefetch;
    without it the handler answers 400 and writes nothing.

    Returns {"status": "ok", ...record_snapshot result} on success, or
    {"status": "error", "error": <message>} with HTTP 500 when the snapshot
    raises.
    """
    if request.query.get("confirm") != "1":
        return web.json_response(
            {"status": "noop", "error": "Add ?confirm=1 to trigger a snapshot write"},
            status=400,
        )
    db_path = request.app["db_path"]
    claim_index_url = request.app.get("claim_index_url", "http://localhost:8080/claim-index")
    try:
        # NOTE(review): record_snapshot() is a synchronous call made directly
        # from this coroutine — confirm that blocking the event loop for its
        # duration is acceptable.
        result = record_snapshot(db_path, claim_index_url)
        return web.json_response({"status": "ok", **result})
    except Exception as e:
        # logger.exception keeps the traceback; the message text is unchanged.
        logger.exception("vitality snapshot failed: %s", e)
        return web.json_response({"status": "error", "error": str(e)}, status=500)
def _score_dimensions(dims: dict) -> dict:
    """Normalize one agent's raw metrics ({dimension: {metric: value}}) into per-dimension scores.

    Missing dimensions or metrics count as 0. Dimension keys match
    VITALITY_WEIGHTS.
    """
    kq = dims.get("knowledge_quality", {})
    challenge_rate = kq.get("challenge_rate", 0)
    breadth = kq.get("activity_breadth", 0)
    evidence = kq.get("evidence_density", 0)
    coverage = kq.get("domain_coverage", 0)
    return {
        # claims_merged, capped so that 100 claims = 1.0
        "knowledge_output": min(
            dims.get("knowledge_output", {}).get("claims_merged", 0) / 100, 1.0
        ),
        # challenge_rate + breadth + evidence_density + domain_coverage, capped at 1.0
        "knowledge_quality": min(
            (challenge_rate / 0.1 * 0.2 + breadth / 4 * 0.2 + evidence * 0.3 + coverage * 0.3),
            1.0,
        ),
        # unique_submitters, capped so that 5 = 1.0
        "contributor_engagement": min(
            dims.get("contributor_engagement", {}).get("unique_submitters", 0) / 5, 1.0
        ),
        # approval_rate used as-is (0 until populated)
        "review_performance": dims.get("review_performance", {}).get("approval_rate", 0),
        # active_days_7d, 7 = 1.0
        "autonomy": min(dims.get("autonomy", {}).get("active_days_7d", 0) / 7, 1.0),
        # merge_rate_7d used as-is
        "infrastructure_health": dims.get("infrastructure_health", {}).get("merge_rate_7d", 0),
        # inverted: response_cost_24h of 0 scores 1.0, 10 or more scores 0
        "spend_efficiency": max(
            1.0 - dims.get("spend_efficiency", {}).get("response_cost_24h", 0) / 10.0, 0
        ),
        # Social/Capital/External: stubbed at 0
        "social_reach": 0,
        "capital": 0,
        "external_impact": 0,
    }


def _raw_highlights(dims: dict) -> dict:
    """Pick the headline raw metrics shown next to an agent's score (rates as percentages)."""
    kq = dims.get("knowledge_quality", {})
    return {
        "claims_merged": int(dims.get("knowledge_output", {}).get("claims_merged", 0)),
        "merge_rate": round(
            dims.get("infrastructure_health", {}).get("merge_rate_7d", 0) * 100, 1
        ),
        "active_days": int(dims.get("autonomy", {}).get("active_days_7d", 0)),
        "challenge_rate": round(kq.get("challenge_rate", 0) * 100, 1),
        "evidence_density": round(kq.get("evidence_density", 0) * 100, 1),
    }


async def handle_vitality_leaderboard(request: web.Request) -> web.Response:
    """GET /api/vitality/leaderboard — agents ranked by composite vitality score.

    Scoring approach:
    - Each dimension gets a normalized score based on the metric values
    - Weighted sum (VITALITY_WEIGHTS) produces the composite score
    - Agents are ranked by composite score, descending

    Only the most recent snapshot is used; agents with no rows in it are
    omitted. Responds with {"has_data": False, ...} when the snapshot table is
    missing or empty.
    """
    db_path = request.app["db_path"]
    conn = _ro_conn(db_path)
    try:
        table_check = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='vitality_snapshots'"
        ).fetchone()
        if not table_check:
            return web.json_response({"error": "No vitality data yet", "has_data": False})

        latest = conn.execute(
            "SELECT MAX(recorded_at) as ts FROM vitality_snapshots"
        ).fetchone()
        if not latest or not latest["ts"]:
            return web.json_response({"has_data": False})
        latest_ts = latest["ts"]

        agent_scores = []
        for agent in ALL_AGENTS:
            rows = conn.execute(
                "SELECT dimension, metric, value FROM vitality_snapshots "
                "WHERE agent_name = ? AND recorded_at = ?",
                (agent, latest_ts)
            ).fetchall()
            if not rows:
                continue

            # Regroup the flat rows as {dimension: {metric: value}}.
            dims = {}
            for r in rows:
                dims.setdefault(r["dimension"], {})[r["metric"]] = r["value"]

            dim_scores = _score_dimensions(dims)
            composite = sum(
                dim_scores.get(dim, 0) * weight
                for dim, weight in VITALITY_WEIGHTS.items()
            )
            agent_scores.append({
                "agent": agent,
                "composite_score": round(composite, 4),
                "dimension_scores": {k: round(v, 4) for k, v in dim_scores.items()},
                "raw_highlights": _raw_highlights(dims),
            })

        # Sort by composite score descending (stable, so ties keep ALL_AGENTS order)
        agent_scores.sort(key=lambda x: x["composite_score"], reverse=True)
        return web.json_response({
            "has_data": True,
            "snapshot_at": latest_ts,
            "leaderboard": agent_scores,
        })
    finally:
        conn.close()
def register_vitality_routes(app: web.Application):
    """Attach the three vitality GET handlers to the aiohttp app's router."""
    route_table = (
        ("/api/vitality", handle_vitality),
        ("/api/vitality/snapshot", handle_vitality_snapshot),
        ("/api/vitality/leaderboard", handle_vitality_leaderboard),
    )
    for url_path, handler in route_table:
        app.router.add_get(url_path, handler)

View file

@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""One-time backfill: populate prs.description with claim titles from merged files.
For PRs that have description=NULL or empty, reads the claim files on main
(for merged PRs) or on the branch (for open PRs) and extracts H1 titles.
Usage: python3 backfill-descriptions.py [--dry-run]
Requires: run from the teleo-codex git worktree (main branch).
"""
import re
import sqlite3
import subprocess
import sys
from pathlib import Path
# SQLite database whose prs table main() reads and updates.
DB_PATH = Path("/opt/teleo-eval/pipeline/pipeline.db")
# Git worktree used as the working directory for every git command.
MAIN_WORKTREE = Path("/opt/teleo-eval/teleo-codex")
# Only changed .md files under these path prefixes are read for claim titles.
CLAIM_DIRS = ("domains/", "core/", "foundations/")
# True when --dry-run appears anywhere on the command line; main() then
# prints what it would set instead of issuing UPDATEs.
dry_run = "--dry-run" in sys.argv
def get_pr_claim_titles(pr_number: int, branch: str, status: str) -> list[str]:
    """Extract H1 claim titles from a PR's changed claim files.

    Changed files are listed with a three-dot diff between origin/main and
    origin/<branch>. Each changed .md file under CLAIM_DIRS is then read from
    origin/main (merged PRs) or from the branch (any other status), and the
    first "# " heading of each file is collected.

    Returns the titles found, possibly an empty list: a failing diff (for
    example a deleted branch), an unreadable file, or a git error all yield
    fewer or no titles rather than raising.
    """
    titles = []
    # Merged content lives on main; anything else is still only on the branch.
    ref = "origin/main" if status == "merged" else f"origin/{branch}"
    try:
        # NOTE(review): for a PR merged with a true merge commit the branch tip
        # is already an ancestor of main, so this three-dot diff would be
        # empty — confirm the merge strategy in use.
        diff = subprocess.run(
            ["git", "diff", "--name-only", f"origin/main...origin/{branch}"],
            capture_output=True, text=True, timeout=10,
            cwd=str(MAIN_WORKTREE),
        )
        if diff.returncode != 0:
            return titles

        claim_prefixes = tuple(CLAIM_DIRS)
        changed_files = [
            path
            for path in (line.strip() for line in diff.stdout.strip().split("\n"))
            if path.startswith(claim_prefixes) and path.endswith(".md")
        ]
        for fpath in changed_files:
            show = subprocess.run(
                ["git", "show", f"{ref}:{fpath}"],
                capture_output=True, text=True, timeout=5,
                cwd=str(MAIN_WORKTREE),
            )
            if show.returncode != 0:
                continue
            for line in show.stdout.split("\n"):
                if line.startswith("# ") and len(line) > 3:
                    titles.append(line[2:].strip())
                    break
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # Best effort: timeouts, a missing git binary/worktree, or undecodable
        # output are reported and whatever was collected so far is returned.
        print(f" PR #{pr_number}: error — {e}")
    return titles
def main():
    """Backfill prs.description for every PR whose description is NULL or empty.

    Each PR's claim titles are joined with " | " and written as the
    description. Work is committed every 50 updates and once more at the end;
    with --dry-run the intended values are printed and no UPDATE is issued.
    PRs without a branch or without any extractable title are counted as
    skipped.
    """
    conn = sqlite3.connect(str(DB_PATH))
    # try/finally so the connection is released even if a query or git call
    # raises part-way through the run.
    try:
        conn.row_factory = sqlite3.Row
        # Find PRs with empty description
        rows = conn.execute(
            "SELECT number, branch, status FROM prs WHERE description IS NULL OR description = '' ORDER BY number DESC"
        ).fetchall()
        print(f"Found {len(rows)} PRs with empty description")

        updated = 0
        skipped = 0
        for row in rows:
            pr_num = row["number"]
            branch = row["branch"]
            status = row["status"]
            if not branch:
                skipped += 1
                continue

            titles = get_pr_claim_titles(pr_num, branch, status)
            if not titles:
                skipped += 1
                continue

            desc = " | ".join(titles)
            if dry_run:
                print(f" PR #{pr_num} ({status}): would set → {desc[:100]}...")
            else:
                conn.execute(
                    "UPDATE prs SET description = ? WHERE number = ?",
                    (desc, pr_num),
                )
            updated += 1
            # Periodic commit so a late failure does not discard earlier work.
            if updated % 50 == 0:
                conn.commit()
                print(f" ...{updated} updated so far")

        if not dry_run:
            conn.commit()
    finally:
        conn.close()

    print(f"\nDone. Updated: {updated}, Skipped: {skipped}, Total: {len(rows)}")
    if dry_run:
        print("(dry run — no changes written)")
if __name__ == "__main__":
main()

View file

@ -9,7 +9,7 @@ the same atomic-write pattern as lib-state.sh.
"""
import asyncio
import hashlib
import secrets
import json
import logging
import os
@ -116,8 +116,8 @@ def _write_inbox_message(agent: str, subject: str, body: str) -> bool:
return False
ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
file_hash = hashlib.md5(f"{agent}-{subject}-{body[:200]}".encode()).hexdigest()[:8]
filename = f"cascade-{ts}-{subject[:60]}-{file_hash}.md"
nonce = secrets.token_hex(3)
filename = f"cascade-{ts}-{nonce}-{subject[:60]}.md"
final_path = inbox_dir / filename
try:

View file

@ -479,6 +479,9 @@ def migrate(conn: sqlite3.Connection):
logger.info("Migration v11: added auto_merge column to prs table")
# v12-v16 ran manually on VPS before code was version-controlled.
# Their changes are consolidated into v17+ migrations below.
if current < 17:
# Add prompt/pipeline version tracking per PR
for col, default in [

View file

@ -376,6 +376,7 @@ async def _extract_one_source(
filename = c.get("filename", "")
if not filename:
continue
filename = Path(filename).name # Strip directory components — LLM output may contain path traversal
if not filename.endswith(".md"):
filename += ".md"
content = _build_claim_content(c, agent_lower)
@ -387,6 +388,7 @@ async def _extract_one_source(
filename = e.get("filename", "")
if not filename:
continue
filename = Path(filename).name # Strip directory components — LLM output may contain path traversal
if not filename.endswith(".md"):
filename += ".md"
action = e.get("action", "create")

View file

@ -0,0 +1,94 @@
"""Stale extraction PR cleanup — closes extraction PRs that produce no claims.
When an extraction PR sits open longer than STALE_THRESHOLD_MINUTES (45) with claims_count=0, it indicates:
- Extraction failed (model couldn't extract anything useful)
- Batch job stalled (no claims written)
- Source material is empty/junk
Auto-closing prevents zombie PRs from blocking the pipeline.
Logs each close for root cause analysis (model failures, bad sources, etc.).
Epimetheus owns this module.
"""
import json
import logging
from datetime import datetime, timezone
from . import config, db
from .forgejo import api, repo_path
logger = logging.getLogger("pipeline.stale_pr")
# Minimum age, in minutes, an open extraction PR must reach before
# check_stale_prs() will consider closing it; passed as the SQL parameter there.
STALE_THRESHOLD_MINUTES = 45
async def check_stale_prs(conn) -> tuple[int, int]:
    """Auto-close open extraction PRs older than STALE_THRESHOLD_MINUTES with zero claims.

    A PR qualifies when its status is 'open', its commit_type is 'extract',
    it was created more than STALE_THRESHOLD_MINUTES ago, and its source has
    claims_count = 0. A PR with no matching row in sources also counts as
    zero claims (LEFT JOIN + COALESCE).

    Each qualifying PR is closed through the Forgejo API; on success the local
    prs row is set to 'closed' and an audit entry is recorded.

    Returns (stale_closed, stale_errors): PRs closed, and close attempts that
    failed (API returned None or an exception was raised).
    """
    closed_count = 0
    failure_count = 0

    candidates = conn.execute(
        """SELECT p.number, p.branch, p.source_path, p.created_at
        FROM prs p
        LEFT JOIN sources s ON p.source_path = s.path
        WHERE p.status = 'open'
        AND p.commit_type = 'extract'
        AND datetime(p.created_at) < datetime('now', '-' || ? || ' minutes')
        AND COALESCE(s.claims_count, 0) = 0""",
        (STALE_THRESHOLD_MINUTES,),
    ).fetchall()

    for candidate in candidates:
        number = candidate["number"]
        origin = candidate["source_path"] or "unknown"
        try:
            # Close the PR on Forgejo first; the local row is only touched
            # once the remote close has succeeded.
            response = await api(
                "PATCH",
                repo_path(f"pulls/{number}"),
                body={"state": "closed"},
            )
            if response is not None:
                conn.execute(
                    "UPDATE prs SET status = 'closed' WHERE number = ?",
                    (number,),
                )
                audit_detail = {
                    "pr": number,
                    "branch": candidate["branch"],
                    "source": origin,
                    "open_minutes": STALE_THRESHOLD_MINUTES,
                }
                db.audit(conn, "watchdog", "stale_pr_closed", json.dumps(audit_detail))
                closed_count += 1
                logger.info(
                    "WATCHDOG: closed stale extraction PR #%d (no claims after %d min): %s",
                    number, STALE_THRESHOLD_MINUTES, origin,
                )
            else:
                failure_count += 1
                logger.warning(
                    "Failed to close stale extraction PR #%d (%s, %s)",
                    number, origin, candidate["branch"],
                )
        except Exception as exc:
            failure_count += 1
            logger.warning(
                "Stale PR close exception for #%d: %s",
                number, exc,
            )

    return closed_count, failure_count

View file

@ -620,6 +620,27 @@ async def validate_pr(conn, pr_number: int) -> dict:
# Extract claim files (domains/, core/, foundations/)
claim_files = extract_claim_files_from_diff(diff)
# ── Backfill description (claim titles) if missing ──
# discover_external_prs creates rows without description. Extract H1 titles
# from the diff so the dashboard shows what the PR actually contains.
existing_desc = conn.execute(
"SELECT description FROM prs WHERE number = ?", (pr_number,)
).fetchone()
if existing_desc and not (existing_desc["description"] or "").strip() and claim_files:
titles = []
for _fp, content in claim_files.items():
for line in content.split("\n"):
if line.startswith("# ") and len(line) > 3:
titles.append(line[2:].strip())
break
if titles:
desc = " | ".join(titles)
conn.execute(
"UPDATE prs SET description = ? WHERE number = ? AND (description IS NULL OR description = '')",
(desc, pr_number),
)
logger.info("PR #%d: backfilled description with %d claim titles", pr_number, len(titles))
# ── Tier 0: per-claim validation ──
# Only validates NEW files (not modified). Modified files have partial content
# from diffs (only + lines) — frontmatter parsing fails on partial content,

View file

@ -19,6 +19,7 @@ import logging
from datetime import datetime, timezone
from . import config, db
from .stale_pr import check_stale_prs
logger = logging.getLogger("pipeline.watchdog")
@ -103,17 +104,94 @@ async def watchdog_check(conn) -> dict:
"action": "GC should auto-close these — check fixer.py GC logic",
})
# 5. Tier0 blockage: many PRs with tier0_pass=0 (potential validation bug)
# 5. Tier0 blockage: auto-reset stuck PRs with retry cap
MAX_TIER0_RESETS = 3
TIER0_RESET_COOLDOWN_S = 3600
tier0_blocked = conn.execute(
"SELECT COUNT(*) as n FROM prs WHERE status = 'open' AND tier0_pass = 0"
).fetchone()["n"]
if tier0_blocked >= 5:
issues.append({
"type": "tier0_blockage",
"severity": "warning",
"detail": f"{tier0_blocked} PRs blocked at tier0_pass=0",
"action": "Check validate.py — may be the modified-file or wiki-link bug recurring",
})
"SELECT number, branch FROM prs WHERE status = 'open' AND tier0_pass = 0"
).fetchall()
if tier0_blocked:
reset_count = 0
permanent_count = 0
for pr in tier0_blocked:
row = conn.execute(
"""SELECT COUNT(*) as n, MAX(timestamp) as last_ts FROM audit_log
WHERE stage = 'watchdog' AND event = 'tier0_reset'
AND json_extract(detail, '$.pr') = ?""",
(pr["number"],),
).fetchone()
prior_resets = row["n"]
if prior_resets >= MAX_TIER0_RESETS:
permanent_count += 1
continue
last_reset = row["last_ts"]
if last_reset:
try:
last_ts = datetime.fromisoformat(last_reset).replace(tzinfo=timezone.utc)
age = (datetime.now(timezone.utc) - last_ts).total_seconds()
if age < TIER0_RESET_COOLDOWN_S:
continue
except (ValueError, TypeError):
pass
conn.execute(
"UPDATE prs SET tier0_pass = NULL WHERE number = ?",
(pr["number"],),
)
db.audit(
conn, "watchdog", "tier0_reset",
json.dumps({
"pr": pr["number"],
"branch": pr["branch"],
"attempt": prior_resets + 1,
"max": MAX_TIER0_RESETS,
}),
)
reset_count += 1
logger.info(
"WATCHDOG: auto-reset tier0 for PR #%d (attempt %d/%d)",
pr["number"], prior_resets + 1, MAX_TIER0_RESETS,
)
if reset_count:
issues.append({
"type": "tier0_reset",
"severity": "info",
"detail": f"Auto-reset {reset_count} PRs stuck at tier0_pass=0 for re-validation",
"action": "Monitor — if same PRs fail again, check validate.py",
})
if permanent_count:
issues.append({
"type": "tier0_permanent_failure",
"severity": "warning",
"detail": f"{permanent_count} PRs exhausted {MAX_TIER0_RESETS} tier0 retries — manual intervention needed",
"action": "Inspect PR content or close stale PRs",
})
# 6. Stale extraction PRs: open >30 min with no claim files
try:
stale_closed, stale_errors = await check_stale_prs(conn)
if stale_closed > 0:
issues.append({
"type": "stale_prs_closed",
"severity": "info",
"detail": f"Auto-closed {stale_closed} stale extraction PRs (no claims after 30 min)",
"action": "Check batch-extract logs for extraction failures",
})
if stale_errors > 0:
issues.append({
"type": "stale_pr_close_failed",
"severity": "warning",
"detail": f"Failed to close {stale_errors} stale PRs",
"action": "Check Forgejo API connectivity",
})
except Exception as e:
logger.warning("Stale PR check failed: %s", e)
# Log issues
healthy = len(issues) == 0
@ -124,7 +202,7 @@ async def watchdog_check(conn) -> dict:
else:
logger.info("WATCHDOG: %s%s", issue["type"], issue["detail"])
return {"healthy": healthy, "issues": issues, "checks_run": 5}
return {"healthy": healthy, "issues": issues, "checks_run": 6}
async def watchdog_cycle(conn, max_workers=None) -> tuple[int, int]: