teleo-codex/ops/diagnostics/tier1_metrics.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00


"""Tier 1 Metrics — The three numbers that matter most for knowledge production.
1. Extraction yield: claims merged / claims evaluated, per agent, per week
2. Cost per merged claim: total spend / merged claims, per week
3. Fix success rate by rejection tag: which rejection reasons are fixable vs terminal
These queries run against pipeline.db (read-only) and power the /api/yield,
/api/cost-per-claim, and /api/fix-rates endpoints.
Owner: Argus <69AF7290-758F-464B-B472-04AFCA4AB340>
"""
import sqlite3
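
# NOTE: every query below indexes result rows by column name, so callers must
# set conn.row_factory = sqlite3.Row before passing the connection in.
# A minimal sketch of the expected setup (the database path is illustrative,
# not fixed by this module):
#
#     conn = sqlite3.connect("file:pipeline.db?mode=ro", uri=True)
#     conn.row_factory = sqlite3.Row

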
def extraction_yield(conn: sqlite3.Connection, days: int = 30) -> dict:
    """Extraction yield = merged / evaluated, trended per agent per day.

    Returns:
        {
            "daily": [{"day": "2026-04-01", "agent": "rio", "evaluated": 20, "merged": 8, "yield": 0.4}, ...],
            "totals": [{"agent": "rio", "evaluated": 100, "merged": 40, "yield": 0.4}, ...],
            "system": {"evaluated": 500, "merged": 200, "yield": 0.4}
        }
    """
    # Daily yield per agent
    # evaluated = approved + rejected (all terminal eval events)
    # merged = approved events only
    daily_rows = conn.execute(
        """
        SELECT date(timestamp) as day,
               json_extract(detail, '$.agent') as agent,
               COUNT(*) as evaluated,
               SUM(CASE WHEN event = 'approved' THEN 1 ELSE 0 END) as merged
        FROM audit_log
        WHERE stage = 'evaluate'
          AND event IN ('approved', 'changes_requested', 'domain_rejected', 'tier05_rejected')
          AND timestamp > datetime('now', ? || ' days')
        GROUP BY day, agent
        ORDER BY day DESC, agent
        """,
        (f"-{days}",),
    ).fetchall()
    daily_data = []
    for r in daily_rows:
        ev = r["evaluated"] or 0
        mg = r["merged"] or 0
        daily_data.append({
            "day": r["day"],
            "agent": r["agent"] or "unknown",
            "evaluated": ev,
            "merged": mg,
            "yield": round(mg / ev, 3) if ev else 0,
        })
    # Per-agent totals (same window)
    totals = conn.execute(
        """
        SELECT json_extract(detail, '$.agent') as agent,
               COUNT(*) as evaluated,
               SUM(CASE WHEN event = 'approved' THEN 1 ELSE 0 END) as merged
        FROM audit_log
        WHERE stage = 'evaluate'
          AND event IN ('approved', 'changes_requested', 'domain_rejected', 'tier05_rejected')
          AND timestamp > datetime('now', ? || ' days')
        GROUP BY agent
        ORDER BY merged DESC
        """,
        (f"-{days}",),
    ).fetchall()
    totals_data = []
    for r in totals:
        ev = r["evaluated"] or 0
        mg = r["merged"] or 0
        totals_data.append({
            "agent": r["agent"] or "unknown",
            "evaluated": ev,
            "merged": mg,
            "yield": round(mg / ev, 3) if ev else 0,
        })
    # System-wide total
    sys_row = conn.execute(
        """
        SELECT COUNT(*) as evaluated,
               SUM(CASE WHEN event = 'approved' THEN 1 ELSE 0 END) as merged
        FROM audit_log
        WHERE stage = 'evaluate'
          AND event IN ('approved', 'changes_requested', 'domain_rejected', 'tier05_rejected')
          AND timestamp > datetime('now', ? || ' days')
        """,
        (f"-{days}",),
    ).fetchone()
    sys_ev = sys_row["evaluated"] or 0
    sys_mg = sys_row["merged"] or 0
    return {
        "days": days,
        "daily": daily_data,
        "totals": totals_data,
        "system": {
            "evaluated": sys_ev,
            "merged": sys_mg,
            "yield": round(sys_mg / sys_ev, 3) if sys_ev else 0,
        },
    }
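

# A minimal sketch of how the /api/yield endpoint named in the module
# docstring might expose this query layer. The web framework and wiring are
# assumptions; only the route path and function come from this file:
#
#     from fastapi import FastAPI
#     app = FastAPI()
#
#     @app.get("/api/yield")
#     def api_yield(days: int = 30):
#         conn = sqlite3.connect("file:pipeline.db?mode=ro", uri=True)
#         conn.row_factory = sqlite3.Row
#         try:
#             return extraction_yield(conn, days)
#         finally:
#             conn.close()

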
def cost_per_merged_claim(conn: sqlite3.Connection, days: int = 30) -> dict:
    """Cost and compute per merged claim, trended per day.

    Uses the costs table for spend + tokens and the prs table for merge counts.
    Breaks down by stage. Separates API spend (dollars) from subscription
    compute (tokens only — Claude Max is flat-rate, so dollars are meaningless).

    Returns:
        {
            "daily": [{"day": "2026-04-01", "actual_spend": 1.50,
                       "estimated_cost": 1.50, "merged": 8,
                       "cost_per_claim": 0.19, "input_tokens": 50000,
                       "output_tokens": 5000, "total_tokens": 55000,
                       "tokens_per_claim": 6875}, ...],
            "by_stage": [{"stage": "eval_leo:openrouter", "api_cost": 1.50,
                          "estimated_cost": 1.50, "input_tokens": 300000,
                          "output_tokens": 50000, "calls": 100,
                          "billing": "api"}, ...],
            "system": {"actual_spend": 2.36, "estimated_cost": 2.36,
                       "merged": 80, "cost_per_claim": 0.03,
                       "total_tokens": 1200000, "tokens_per_claim": 15000,
                       "subscription_tokens": 0, "api_tokens": 1200000}
        }
    """
    # Daily: cost + tokens from the costs table, merged count from the prs table
    daily_cost = conn.execute(
        """
        SELECT date as day,
               SUM(cost_usd) as api_cost,
               SUM(cost_estimate_usd) as estimated_cost,
               SUM(input_tokens) as input_tokens,
               SUM(output_tokens) as output_tokens
        FROM costs
        WHERE date > date('now', ? || ' days')
        GROUP BY day
        ORDER BY day DESC
        """,
        (f"-{days}",),
    ).fetchall()
    daily_merges = conn.execute(
        """
        SELECT date(merged_at) as day,
               COUNT(*) as merged
        FROM prs
        WHERE status = 'merged'
          AND merged_at > datetime('now', ? || ' days')
        GROUP BY day
        ORDER BY day DESC
        """,
        (f"-{days}",),
    ).fetchall()
    # Merge into a combined daily view
    merge_map = {r["day"]: r["merged"] for r in daily_merges}
    cost_map = {}
    for r in daily_cost:
        cost_map[r["day"]] = {
            "api_cost": r["api_cost"] or 0,
            "estimated_cost": r["estimated_cost"] or 0,
            "input_tokens": r["input_tokens"] or 0,
            "output_tokens": r["output_tokens"] or 0,
        }
    all_days = sorted(set(merge_map) | set(cost_map), reverse=True)
    daily_data = []
    for day in all_days:
        c = cost_map.get(day, {"api_cost": 0, "estimated_cost": 0, "input_tokens": 0, "output_tokens": 0})
        merged = merge_map.get(day, 0) or 0
        total_tokens = c["input_tokens"] + c["output_tokens"]
        daily_data.append({
            "day": day,
            "actual_spend": round(c["api_cost"], 4),
            "estimated_cost": round(c["estimated_cost"], 4),
            "merged": merged,
            "cost_per_claim": round(c["estimated_cost"] / merged, 4) if merged else None,
            "input_tokens": c["input_tokens"],
            "output_tokens": c["output_tokens"],
            "total_tokens": total_tokens,
            "tokens_per_claim": round(total_tokens / merged) if merged else None,
        })
    # By stage with billing type (full window)
    by_stage = conn.execute(
        """
        SELECT stage,
               SUM(cost_usd) as api_cost,
               SUM(cost_estimate_usd) as estimated_cost,
               SUM(input_tokens) as input_tokens,
               SUM(output_tokens) as output_tokens,
               SUM(calls) as calls
        FROM costs
        WHERE date > date('now', ? || ' days')
        GROUP BY stage
        ORDER BY SUM(input_tokens + output_tokens) DESC
        """,
        (f"-{days}",),
    ).fetchall()
    stage_data = []
    total_api_cost = 0
    total_estimated_cost = 0
    total_input = 0
    total_output = 0
    subscription_tokens = 0
    api_tokens = 0
    for r in by_stage:
        cost = r["api_cost"] or 0
        est = r["estimated_cost"] or 0
        inp = r["input_tokens"] or 0
        out = r["output_tokens"] or 0
        calls = r["calls"] or 0
        stage_name = r["stage"]
        # :max suffix = subscription, :openrouter suffix = API
        billing = "subscription" if ":max" in stage_name else "api"
        total_api_cost += cost
        total_estimated_cost += est
        total_input += inp
        total_output += out
        if billing == "subscription":
            subscription_tokens += inp + out
        else:
            api_tokens += inp + out
        stage_data.append({
            "stage": stage_name,
            "api_cost": round(cost, 4),
            "estimated_cost": round(est, 4),
            "input_tokens": inp,
            "output_tokens": out,
            "calls": calls,
            "billing": billing,
        })
    # System totals
    sys_merged = conn.execute(
        "SELECT COUNT(*) as n FROM prs WHERE status='merged' AND merged_at > datetime('now', ? || ' days')",
        (f"-{days}",),
    ).fetchone()["n"] or 0
    total_tokens = total_input + total_output
    return {
        "days": days,
        "daily": daily_data,
        "by_stage": stage_data,
        "system": {
            "actual_spend": round(total_api_cost, 4),
            "estimated_cost": round(total_estimated_cost, 4),
            "merged": sys_merged,
            "cost_per_claim": round(total_estimated_cost / sys_merged, 4) if sys_merged else None,
            "total_tokens": total_tokens,
            "tokens_per_claim": round(total_tokens / sys_merged) if sys_merged else None,
            "subscription_tokens": subscription_tokens,
            "api_tokens": api_tokens,
            "note": "estimated_cost = API-rate equivalent for all calls (unified metric). actual_spend = real dollars charged to OpenRouter.",
        },
    }
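

# Billing convention used above and in compute_profile(): a ':max' suffix on
# the stage name marks flat-rate Claude Max subscription calls; anything else
# (e.g. the ':openrouter' suffix) is metered API spend. Illustrative mapping
# (the second stage name is hypothetical):
#     "eval_leo:openrouter" -> "api"
#     "eval_leo:max"        -> "subscription"

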
def fix_success_by_tag(conn: sqlite3.Connection, days: int = 30) -> dict:
    """Fix success rate broken down by rejection reason.

    For each rejection tag: how many PRs got that rejection, how many eventually
    merged (successful fix), how many are still open (in progress), and how many
    were abandoned (closed/zombie without merge). Rates are computed over
    resolved PRs only (fixed + terminal), excluding those still in progress.

    Returns:
        {
            "tags": [
                {
                    "tag": "insufficient_evidence",
                    "total": 50,
                    "fixed": 10,
                    "in_progress": 5,
                    "terminal": 35,
                    "fix_rate": 0.222,
                    "terminal_rate": 0.778
                }, ...
            ]
        }
    """
    # Get all rejection events with their tags and PR numbers,
    # then join with the prs table to see the final outcome
    rows = conn.execute(
        """
        SELECT value as tag,
               json_extract(al.detail, '$.pr') as pr_number
        FROM audit_log al, json_each(json_extract(al.detail, '$.issues'))
        WHERE al.stage = 'evaluate'
          AND al.event IN ('changes_requested', 'domain_rejected', 'tier05_rejected')
          AND al.timestamp > datetime('now', ? || ' days')
        """,
        (f"-{days}",),
    ).fetchall()
    # Collect unique PRs per tag
    tag_prs: dict[str, set] = {}
    for r in rows:
        tag = r["tag"]
        pr = r["pr_number"]
        if tag not in tag_prs:
            tag_prs[tag] = set()
        if pr is not None:
            tag_prs[tag].add(pr)
    if not tag_prs:
        return {"days": days, "tags": []}
    # Get status for all referenced PRs in one query
    all_prs = set()
    for prs in tag_prs.values():
        all_prs.update(prs)
    if not all_prs:
        return {"days": days, "tags": []}
    placeholders = ",".join("?" for _ in all_prs)
    pr_statuses = conn.execute(
        f"SELECT number, status FROM prs WHERE number IN ({placeholders})",
        list(all_prs),
    ).fetchall()
    status_map = {r["number"]: r["status"] for r in pr_statuses}
    # Compute per-tag outcomes, largest tag first
    tag_data = []
    for tag, prs in sorted(tag_prs.items(), key=lambda x: -len(x[1])):
        fixed = 0
        in_progress = 0
        terminal = 0
        for pr in prs:
            st = status_map.get(pr, "unknown")
            if st == "merged":
                fixed += 1
            elif st in ("open", "validating", "reviewing", "merging"):
                in_progress += 1
            else:
                # closed, zombie, conflict, unknown
                terminal += 1
        total = len(prs)
        # Fix rate excludes in-progress PRs (only counts resolved ones),
        # e.g. fixed=10, terminal=35 -> resolved=45, fix_rate=10/45=0.222
        resolved = fixed + terminal
        tag_data.append({
            "tag": tag,
            "total": total,
            "fixed": fixed,
            "in_progress": in_progress,
            "terminal": terminal,
            "fix_rate": round(fixed / resolved, 3) if resolved else None,
            "terminal_rate": round(terminal / resolved, 3) if resolved else None,
        })
    return {"days": days, "tags": tag_data}


def compute_profile(conn: sqlite3.Connection, days: int = 30) -> dict:
    """Compute profile — Max subscription telemetry alongside API usage.

    Surfaces: cache hit rates, latency, cost estimates (API-equivalent),
    and token breakdown by billing type.
    """
    rows = conn.execute(
        """
        SELECT stage, model,
               SUM(calls) as calls,
               SUM(input_tokens) as input_tokens,
               SUM(output_tokens) as output_tokens,
               SUM(cost_usd) as api_cost,
               SUM(duration_ms) as duration_ms,
               SUM(cache_read_tokens) as cache_read_tokens,
               SUM(cache_write_tokens) as cache_write_tokens,
               SUM(cost_estimate_usd) as cost_estimate_usd
        FROM costs
        WHERE date > date('now', ? || ' days')
        GROUP BY stage, model
        ORDER BY SUM(input_tokens + output_tokens) DESC
        """,
        (f"-{days}",),
    ).fetchall()
    stage_data = []
    total_calls = 0
    total_tokens = 0
    total_duration = 0
    total_cache_read = 0
    total_cache_write = 0
    api_calls = 0
    sub_calls = 0
    api_spend = 0.0
    sub_estimate = 0.0
    sub_input_tokens = 0
    for r in rows:
        calls = r["calls"] or 0
        inp = r["input_tokens"] or 0
        out = r["output_tokens"] or 0
        dur = r["duration_ms"] or 0
        cr = r["cache_read_tokens"] or 0
        cw = r["cache_write_tokens"] or 0
        cost = r["api_cost"] or 0
        est = r["cost_estimate_usd"] or 0
        stage_name = r["stage"]
        # :max suffix = subscription, everything else = metered API
        billing = "subscription" if ":max" in stage_name else "api"
        total_calls += calls
        total_tokens += inp + out
        total_duration += dur
        total_cache_read += cr
        total_cache_write += cw
        if billing == "subscription":
            sub_calls += calls
            sub_estimate += est
            sub_input_tokens += inp
        else:
            api_calls += calls
            api_spend += cost
        stage_data.append({
            "stage": stage_name,
            "model": r["model"],
            "calls": calls,
            "input_tokens": inp,
            "output_tokens": out,
            "total_tokens": inp + out,
            "duration_ms": dur,
            "avg_latency_ms": round(dur / calls) if calls else 0,
            "cache_read_tokens": cr,
            "cache_write_tokens": cw,
            "cache_hit_rate": round(cr / (cr + inp), 3) if (cr + inp) else 0,
            "api_cost": round(cost, 4),
            "cost_estimate_usd": round(est, 4),
            "billing": billing,
        })
    # Cache summary (only meaningful for subscription/Max calls)
    total_cacheable = total_cache_read + total_cache_write + sub_input_tokens
    cache_hit_rate = round(total_cache_read / total_cacheable, 3) if total_cacheable else 0
    return {
        "days": days,
        "by_stage": stage_data,
        "cache": {
            "read_tokens": total_cache_read,
            "write_tokens": total_cache_write,
            "hit_rate": cache_hit_rate,
            "note": "Cache hits are prompt tokens served from cache (cheaper/faster)",
        },
        "latency": {
            "total_ms": total_duration,
            "avg_ms_per_call": round(total_duration / total_calls) if total_calls else 0,
            "note": "Wall-clock time including network. Only populated for Claude Max calls.",
        },
        "subscription_estimate": {
            "total_cost_usd": round(sub_estimate, 4),
            "note": "What subscription calls would cost at API rates. Actual cost: $0 (flat-rate Max plan).",
        },
        "system": {
            "total_calls": total_calls,
            "total_tokens": total_tokens,
            "api_calls": api_calls,
            "subscription_calls": sub_calls,
            "api_spend": round(api_spend, 4),
            "subscription_estimate": round(sub_estimate, 4),
            "cache_hit_rate": cache_hit_rate,
        },
    }
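

# Hedged smoke test: run all four queries against a local pipeline.db copy and
# print the system-level summaries. The default DB path is illustrative; pass
# the real path as argv[1].
if __name__ == "__main__":
    import json
    import sys

    db_path = sys.argv[1] if len(sys.argv) > 1 else "pipeline.db"
    conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
    conn.row_factory = sqlite3.Row
    try:
        print("yield:", json.dumps(extraction_yield(conn)["system"]))
        print("cost:", json.dumps(cost_per_merged_claim(conn)["system"]))
        print("fix rates:", json.dumps(fix_success_by_tag(conn)["tags"][:5]))
        print("compute:", json.dumps(compute_profile(conn)["system"]))
    finally:
        conn.close()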