Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source of truth. Previously only 8 of 67 files existed in repo — the rest were deployed directly to VPS via SCP, causing massive drift. Includes: - pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.) - pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh - diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics) - agent-state/: bootstrap, lib-state, cascade inbox processor, schema - systemd/: service unit files for reference - deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate - research-session.sh: updated with Step 8.5 digest + cascade inbox processing No new code written — all files are exact copies from VPS as of 2026-04-06. From this point forward: edit in repo, commit, then deploy.sh. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
476 lines
16 KiB
Python
476 lines
16 KiB
Python
"""Tier 1 Metrics — The three numbers that matter most for knowledge production.
|
|
|
|
1. Extraction yield: claims merged / claims evaluated, per agent, per day
|
|
2. Cost per merged claim: total spend / merged claims, per day
|
|
3. Fix success rate by rejection tag: which rejection reasons are fixable vs terminal
|
|
|
|
These queries run against pipeline.db (read-only) and power the /api/yield,
|
|
/api/cost-per-claim, and /api/fix-rates endpoints.
|
|
|
|
Owner: Argus <69AF7290-758F-464B-B472-04AFCA4AB340>
|
|
"""
|
|
|
|
import sqlite3
|
|
|
|
|
|
def extraction_yield(conn: sqlite3.Connection, days: int = 30) -> dict:
    """Extraction yield = merged / evaluated, trended per agent per day.

    evaluated = all terminal evaluation events (approved plus the three
    rejection types); merged = 'approved' events only.

    Args:
        conn: Read-only connection to pipeline.db; row_factory must be
            sqlite3.Row so columns are addressable by name.
        days: Look-back window in days.

    Returns:
        {
            "days": 30,
            "daily": [{"day": "2026-04-01", "agent": "rio", "evaluated": 20,
                       "merged": 8, "yield": 0.4}, ...],
            "totals": [{"agent": "rio", "evaluated": 100, "merged": 40,
                        "yield": 0.4}, ...],
            "system": {"evaluated": 500, "merged": 200, "yield": 0.4}
        }
    """
    window = (f"-{days}",)

    def _shape(row, **head) -> dict:
        # Shared row -> dict shaping. yield is 0 (not None) when nothing
        # was evaluated, so dashboards can plot it without null checks.
        ev = row["evaluated"] or 0
        mg = row["merged"] or 0
        return {
            **head,
            "evaluated": ev,
            "merged": mg,
            "yield": round(mg / ev, 3) if ev else 0,
        }

    # Daily yield per agent (GROUP BY calendar day — not ISO week).
    daily_rows = conn.execute(
        """
        SELECT date(timestamp) as day,
               json_extract(detail, '$.agent') as agent,
               COUNT(*) as evaluated,
               SUM(CASE WHEN event = 'approved' THEN 1 ELSE 0 END) as merged
        FROM audit_log
        WHERE stage = 'evaluate'
          AND event IN ('approved', 'changes_requested', 'domain_rejected', 'tier05_rejected')
          AND timestamp > datetime('now', ? || ' days')
        GROUP BY day, agent
        ORDER BY day DESC, agent
        """,
        window,
    ).fetchall()
    daily_data = [
        _shape(r, day=r["day"], agent=r["agent"] or "unknown") for r in daily_rows
    ]

    # Per-agent totals over the same window.
    total_rows = conn.execute(
        """
        SELECT json_extract(detail, '$.agent') as agent,
               COUNT(*) as evaluated,
               SUM(CASE WHEN event = 'approved' THEN 1 ELSE 0 END) as merged
        FROM audit_log
        WHERE stage = 'evaluate'
          AND event IN ('approved', 'changes_requested', 'domain_rejected', 'tier05_rejected')
          AND timestamp > datetime('now', ? || ' days')
        GROUP BY agent
        ORDER BY merged DESC
        """,
        window,
    ).fetchall()
    totals_data = [_shape(r, agent=r["agent"] or "unknown") for r in total_rows]

    # System-wide total (single aggregate row).
    sys_row = conn.execute(
        """
        SELECT COUNT(*) as evaluated,
               SUM(CASE WHEN event = 'approved' THEN 1 ELSE 0 END) as merged
        FROM audit_log
        WHERE stage = 'evaluate'
          AND event IN ('approved', 'changes_requested', 'domain_rejected', 'tier05_rejected')
          AND timestamp > datetime('now', ? || ' days')
        """,
        window,
    ).fetchone()

    return {
        "days": days,
        "daily": daily_data,
        "totals": totals_data,
        "system": _shape(sys_row),
    }
|
|
|
|
|
|
def cost_per_merged_claim(conn: sqlite3.Connection, days: int = 30) -> dict:
    """Cost and compute per merged claim, trended per day.

    Uses the costs table for spend + tokens and the prs table for merge
    counts. Breaks down by stage. Separates real API spend (dollars) from
    subscription compute (tokens plus an API-rate estimate — Claude Max is
    flat-rate, so actual dollars are meaningless there).

    Args:
        conn: Read-only connection to pipeline.db; row_factory must be
            sqlite3.Row.
        days: Look-back window in days.

    Returns:
        {
            "daily": [{"day": "2026-04-01", "actual_spend": 1.5,
                       "estimated_cost": 2.1, "merged": 8,
                       "cost_per_claim": 0.2625, "input_tokens": 50000,
                       "output_tokens": 5000, "total_tokens": 55000,
                       "tokens_per_claim": 6875}, ...],
            "by_stage": [{"stage": "eval_leo:openrouter", "api_cost": 1.5,
                          "estimated_cost": 1.5, "input_tokens": 300000,
                          "output_tokens": 50000, "calls": 100,
                          "billing": "api"}, ...],
            "system": {"actual_spend": 2.36, "estimated_cost": 2.4,
                       "merged": 80, "cost_per_claim": 0.03,
                       "total_tokens": 1200000, "tokens_per_claim": 15000,
                       "subscription_tokens": 0, "api_tokens": 1200000,
                       "note": "..."}
        }

    cost_per_claim = estimated_cost / merged (API-rate equivalent so
    subscription and API calls are comparable); None when nothing merged.
    """
    window = (f"-{days}",)

    # Daily cost + tokens from the costs table.
    daily_cost = conn.execute(
        """
        SELECT date as day,
               SUM(cost_usd) as api_cost,
               SUM(cost_estimate_usd) as estimated_cost,
               SUM(input_tokens) as input_tokens,
               SUM(output_tokens) as output_tokens
        FROM costs
        WHERE date > date('now', ? || ' days')
        GROUP BY day
        ORDER BY day DESC
        """,
        window,
    ).fetchall()

    # Daily merge counts from the prs table.
    daily_merges = conn.execute(
        """
        SELECT date(merged_at) as day,
               COUNT(*) as merged
        FROM prs
        WHERE status = 'merged'
          AND merged_at > datetime('now', ? || ' days')
        GROUP BY day
        ORDER BY day DESC
        """,
        window,
    ).fetchall()

    # Join the two daily series on calendar day (either side may have gaps).
    merge_map = {r["day"]: r["merged"] for r in daily_merges}
    cost_map = {
        r["day"]: {
            "api_cost": r["api_cost"] or 0,
            "estimated_cost": r["estimated_cost"] or 0,
            "input_tokens": r["input_tokens"] or 0,
            "output_tokens": r["output_tokens"] or 0,
        }
        for r in daily_cost
    }

    no_cost = {"api_cost": 0, "estimated_cost": 0, "input_tokens": 0, "output_tokens": 0}
    daily_data = []
    for day in sorted(set(merge_map) | set(cost_map), reverse=True):
        c = cost_map.get(day, no_cost)
        merged = merge_map.get(day, 0) or 0
        total_tokens = c["input_tokens"] + c["output_tokens"]
        daily_data.append({
            "day": day,
            "actual_spend": round(c["api_cost"], 4),
            "estimated_cost": round(c["estimated_cost"], 4),
            "merged": merged,
            "cost_per_claim": round(c["estimated_cost"] / merged, 4) if merged else None,
            "input_tokens": c["input_tokens"],
            "output_tokens": c["output_tokens"],
            "total_tokens": total_tokens,
            "tokens_per_claim": round(total_tokens / merged) if merged else None,
        })

    # Per-stage breakdown with billing type (full window).
    by_stage = conn.execute(
        """
        SELECT stage,
               SUM(cost_usd) as api_cost,
               SUM(cost_estimate_usd) as estimated_cost,
               SUM(input_tokens) as input_tokens,
               SUM(output_tokens) as output_tokens,
               SUM(calls) as calls
        FROM costs
        WHERE date > date('now', ? || ' days')
        GROUP BY stage
        ORDER BY SUM(input_tokens + output_tokens) DESC
        """,
        window,
    ).fetchall()

    stage_data = []
    total_api_cost = 0
    total_estimated_cost = 0
    total_input = 0
    total_output = 0
    subscription_tokens = 0
    api_tokens = 0
    for r in by_stage:
        cost = r["api_cost"] or 0
        est = r["estimated_cost"] or 0
        inp = r["input_tokens"] or 0
        out = r["output_tokens"] or 0
        calls = r["calls"] or 0
        stage_name = r["stage"]
        # :max suffix = subscription, :openrouter suffix = API
        billing = "subscription" if ":max" in stage_name else "api"
        total_api_cost += cost
        total_estimated_cost += est
        total_input += inp
        total_output += out
        if billing == "subscription":
            subscription_tokens += inp + out
        else:
            api_tokens += inp + out
        stage_data.append({
            "stage": stage_name,
            "api_cost": round(cost, 4),
            "estimated_cost": round(est, 4),
            "input_tokens": inp,
            "output_tokens": out,
            "calls": calls,
            "billing": billing,
        })

    # System totals over the same window.
    sys_merged = conn.execute(
        "SELECT COUNT(*) as n FROM prs WHERE status='merged' AND merged_at > datetime('now', ? || ' days')",
        window,
    ).fetchone()["n"] or 0

    total_tokens = total_input + total_output

    return {
        "days": days,
        "daily": daily_data,
        "by_stage": stage_data,
        "system": {
            "actual_spend": round(total_api_cost, 4),
            "estimated_cost": round(total_estimated_cost, 4),
            "merged": sys_merged,
            "cost_per_claim": round(total_estimated_cost / sys_merged, 4) if sys_merged else None,
            "total_tokens": total_tokens,
            "tokens_per_claim": round(total_tokens / sys_merged) if sys_merged else None,
            "subscription_tokens": subscription_tokens,
            "api_tokens": api_tokens,
            "note": "estimated_cost = API-rate equivalent for all calls (unified metric). actual_spend = real dollars charged to OpenRouter.",
        },
    }
|
|
|
|
|
|
def fix_success_by_tag(conn: sqlite3.Connection, days: int = 30) -> dict:
    """Fix success rate broken down by rejection reason.

    For each rejection tag: how many distinct PRs received that rejection,
    how many eventually merged (successful fix), how many are still being
    worked (in progress), and how many ended without a merge (terminal:
    closed/zombie/conflict/unknown).

    Returns:
        {"days": 30,
         "tags": [{"tag": "insufficient_evidence", "total": 50, "fixed": 10,
                   "in_progress": 5, "terminal": 35, "fix_rate": 0.2,
                   "terminal_rate": 0.7}, ...]}

    fix_rate and terminal_rate are computed over resolved PRs only
    (fixed + terminal); both are None while every PR is still in progress.
    """
    # Expand each rejection event's issues array into (tag, pr) pairs.
    rejection_rows = conn.execute(
        """
        SELECT value as tag,
               json_extract(al.detail, '$.pr') as pr_number
        FROM audit_log al, json_each(json_extract(al.detail, '$.issues'))
        WHERE al.stage = 'evaluate'
          AND al.event IN ('changes_requested', 'domain_rejected', 'tier05_rejected')
          AND al.timestamp > datetime('now', ? || ' days')
        """,
        (f"-{days}",),
    ).fetchall()

    # Distinct PRs per tag (a PR rejected twice for the same tag counts once).
    prs_by_tag: dict[str, set] = {}
    for row in rejection_rows:
        bucket = prs_by_tag.setdefault(row["tag"], set())
        if row["pr_number"] is not None:
            bucket.add(row["pr_number"])

    if not prs_by_tag:
        return {"days": days, "tags": []}

    referenced = set().union(*prs_by_tag.values())
    if not referenced:
        return {"days": days, "tags": []}

    # One query resolves the final status of every referenced PR.
    marks = ",".join("?" * len(referenced))
    status_of = {
        r["number"]: r["status"]
        for r in conn.execute(
            f"SELECT number, status FROM prs WHERE number IN ({marks})",
            list(referenced),
        ).fetchall()
    }

    active_states = ("open", "validating", "reviewing", "merging")
    tag_data = []
    for tag, pr_set in sorted(prs_by_tag.items(), key=lambda kv: -len(kv[1])):
        statuses = [status_of.get(pr, "unknown") for pr in pr_set]
        fixed = statuses.count("merged")
        in_progress = sum(1 for st in statuses if st in active_states)
        # Everything else (closed, zombie, conflict, unknown) is terminal.
        terminal = len(statuses) - fixed - in_progress
        resolved = fixed + terminal  # rates exclude in-progress PRs
        tag_data.append({
            "tag": tag,
            "total": len(pr_set),
            "fixed": fixed,
            "in_progress": in_progress,
            "terminal": terminal,
            "fix_rate": round(fixed / resolved, 3) if resolved else None,
            "terminal_rate": round(terminal / resolved, 3) if resolved else None,
        })

    return {"days": days, "tags": tag_data}
|
|
|
|
|
|
def compute_profile(conn: "sqlite3.Connection", days: int = 30) -> dict:
    """Compute profile — Max subscription telemetry alongside API usage.

    Surfaces: cache hit rates, latency, cost estimates (API-equivalent),
    token breakdown by billing type.
    """
    rows = conn.execute(
        """
        SELECT stage, model,
               SUM(calls) as calls,
               SUM(input_tokens) as input_tokens,
               SUM(output_tokens) as output_tokens,
               SUM(cost_usd) as api_cost,
               SUM(duration_ms) as duration_ms,
               SUM(cache_read_tokens) as cache_read_tokens,
               SUM(cache_write_tokens) as cache_write_tokens,
               SUM(cost_estimate_usd) as cost_estimate_usd
        FROM costs
        WHERE date > date('now', ? || ' days')
        GROUP BY stage, model
        ORDER BY SUM(input_tokens + output_tokens) DESC
        """,
        (f"-{days}",),
    ).fetchall()

    per_stage = []
    n_calls_all = 0
    n_tokens_all = 0
    ms_all = 0
    cache_reads = 0
    cache_writes = 0
    n_api_calls = 0
    n_sub_calls = 0
    spend_api = 0.0
    estimate_sub = 0.0
    sub_inputs = 0

    for row in rows:
        n_calls = row["calls"] or 0
        tok_in = row["input_tokens"] or 0
        tok_out = row["output_tokens"] or 0
        ms = row["duration_ms"] or 0
        reads = row["cache_read_tokens"] or 0
        writes = row["cache_write_tokens"] or 0
        spent = row["api_cost"] or 0
        estimated = row["cost_estimate_usd"] or 0
        name = row["stage"]
        # :max suffix = flat-rate subscription; everything else is metered API.
        is_sub = ":max" in name

        n_calls_all += n_calls
        n_tokens_all += tok_in + tok_out
        ms_all += ms
        cache_reads += reads
        cache_writes += writes

        if is_sub:
            n_sub_calls += n_calls
            estimate_sub += estimated
            sub_inputs += tok_in
        else:
            n_api_calls += n_calls
            spend_api += spent

        per_stage.append({
            "stage": name,
            "model": row["model"],
            "calls": n_calls,
            "input_tokens": tok_in,
            "output_tokens": tok_out,
            "total_tokens": tok_in + tok_out,
            "duration_ms": ms,
            "avg_latency_ms": round(ms / n_calls) if n_calls else 0,
            "cache_read_tokens": reads,
            "cache_write_tokens": writes,
            "cache_hit_rate": round(reads / (reads + tok_in), 3) if (reads + tok_in) else 0,
            "api_cost": round(spent, 4),
            "cost_estimate_usd": round(estimated, 4),
            "billing": "subscription" if is_sub else "api",
        })

    # Cache summary (only meaningful for subscription/Max calls).
    cacheable = cache_reads + cache_writes + sub_inputs
    overall_hit_rate = round(cache_reads / cacheable, 3) if cacheable else 0

    return {
        "days": days,
        "by_stage": per_stage,
        "cache": {
            "read_tokens": cache_reads,
            "write_tokens": cache_writes,
            "hit_rate": overall_hit_rate,
            "note": "Cache hits are prompt tokens served from cache (cheaper/faster)",
        },
        "latency": {
            "total_ms": ms_all,
            "avg_ms_per_call": round(ms_all / n_calls_all) if n_calls_all else 0,
            "note": "Wall-clock time including network. Only populated for Claude Max calls.",
        },
        "subscription_estimate": {
            "total_cost_usd": round(estimate_sub, 4),
            "note": "What subscription calls would cost at API rates. Actual cost: $0 (flat-rate Max plan).",
        },
        "system": {
            "total_calls": n_calls_all,
            "total_tokens": n_tokens_all,
            "api_calls": n_api_calls,
            "subscription_calls": n_sub_calls,
            "api_spend": round(spend_api, 4),
            "subscription_estimate": round(estimate_sub, 4),
            "cache_hit_rate": overall_hit_rate,
        },
    }
|