"""Analytics module — time-series metrics snapshots + chart data endpoints. Records pipeline metrics every 15 minutes. Serves historical data for Chart.js dashboard. Tracks source origin (agent/human/scraper) for pipeline funnel visualization. Priority 1 from Cory via Ganymede. Epimetheus owns this module. """ import json import logging import re from datetime import datetime, timezone from . import config, db logger = logging.getLogger("pipeline.analytics") # ─── Snapshot recording ──────────────────────────────────────────────────── def record_snapshot(conn) -> dict: """Record a metrics snapshot. Called every 15 minutes by the pipeline daemon. Returns the snapshot dict for logging/debugging. """ # Throughput (last hour) throughput = conn.execute( """SELECT COUNT(*) as n FROM audit_log WHERE timestamp > datetime('now', '-1 hour') AND event IN ('approved', 'changes_requested', 'merged')""" ).fetchone() # PR status counts statuses = conn.execute("SELECT status, COUNT(*) as n FROM prs GROUP BY status").fetchall() status_map = {r["status"]: r["n"] for r in statuses} # Approval rate (24h) verdicts = conn.execute( """SELECT COUNT(*) as total, SUM(CASE WHEN status IN ('merged', 'approved') THEN 1 ELSE 0 END) as passed FROM prs WHERE last_attempt > datetime('now', '-24 hours')""" ).fetchone() total = verdicts["total"] or 0 passed = verdicts["passed"] or 0 approval_rate = round(passed / total, 3) if total > 0 else None # Evaluated in 24h evaluated = conn.execute( """SELECT COUNT(*) as n FROM prs WHERE last_attempt > datetime('now', '-24 hours') AND domain_verdict != 'pending'""" ).fetchone() # Fix success rate fix_stats = conn.execute( """SELECT COUNT(*) as attempted, SUM(CASE WHEN status IN ('merged', 'approved') THEN 1 ELSE 0 END) as succeeded FROM prs WHERE fix_attempts > 0""" ).fetchone() fix_rate = round((fix_stats["succeeded"] or 0) / fix_stats["attempted"], 3) if fix_stats["attempted"] else None # Rejection reasons (24h) issue_rows = conn.execute( """SELECT eval_issues FROM prs WHERE eval_issues IS NOT NULL AND eval_issues != '[]' AND last_attempt > datetime('now', '-24 hours')""" ).fetchall() tag_counts = {} for row in issue_rows: try: tags = json.loads(row["eval_issues"]) for tag in tags: if isinstance(tag, str): tag_counts[tag] = tag_counts.get(tag, 0) + 1 except (json.JSONDecodeError, TypeError): pass # Source origin counts (24h) — agent vs human vs scraper source_origins = _count_source_origins(conn) snapshot = { "throughput_1h": throughput["n"] if throughput else 0, "approval_rate": approval_rate, "open_prs": status_map.get("open", 0), "merged_total": status_map.get("merged", 0), "closed_total": status_map.get("closed", 0), "conflict_total": status_map.get("conflict", 0), "evaluated_24h": evaluated["n"] if evaluated else 0, "fix_success_rate": fix_rate, "rejection_broken_wiki_links": tag_counts.get("broken_wiki_links", 0), "rejection_frontmatter_schema": tag_counts.get("frontmatter_schema", 0), "rejection_near_duplicate": tag_counts.get("near_duplicate", 0), "rejection_confidence": tag_counts.get("confidence_miscalibration", 0), "rejection_other": sum(v for k, v in tag_counts.items() if k not in ("broken_wiki_links", "frontmatter_schema", "near_duplicate", "confidence_miscalibration")), "extraction_model": config.EXTRACT_MODEL, "eval_domain_model": config.EVAL_DOMAIN_MODEL, "eval_leo_model": config.EVAL_LEO_STANDARD_MODEL, "prompt_version": config.PROMPT_VERSION, "pipeline_version": config.PIPELINE_VERSION, "source_origin_agent": source_origins.get("agent", 0), "source_origin_human": source_origins.get("human", 0), "source_origin_scraper": source_origins.get("scraper", 0), } # Write to DB conn.execute( """INSERT INTO metrics_snapshots ( throughput_1h, approval_rate, open_prs, merged_total, closed_total, conflict_total, evaluated_24h, fix_success_rate, rejection_broken_wiki_links, rejection_frontmatter_schema, rejection_near_duplicate, rejection_confidence, rejection_other, extraction_model, eval_domain_model, eval_leo_model, prompt_version, pipeline_version, source_origin_agent, source_origin_human, source_origin_scraper ) VALUES ( :throughput_1h, :approval_rate, :open_prs, :merged_total, :closed_total, :conflict_total, :evaluated_24h, :fix_success_rate, :rejection_broken_wiki_links, :rejection_frontmatter_schema, :rejection_near_duplicate, :rejection_confidence, :rejection_other, :extraction_model, :eval_domain_model, :eval_leo_model, :prompt_version, :pipeline_version, :source_origin_agent, :source_origin_human, :source_origin_scraper )""", snapshot, ) logger.debug("Recorded metrics snapshot: approval=%.1f%%, throughput=%d/h", (approval_rate or 0) * 100, snapshot["throughput_1h"]) return snapshot def _count_source_origins(conn) -> dict[str, int]: """Count source origins from recent PRs. Returns {agent: N, human: N, scraper: N}.""" counts = {"agent": 0, "human": 0, "scraper": 0} rows = conn.execute( """SELECT origin, COUNT(*) as n FROM prs WHERE created_at > datetime('now', '-24 hours') GROUP BY origin""" ).fetchall() for row in rows: origin = row["origin"] or "pipeline" if origin == "human": counts["human"] += row["n"] elif origin == "pipeline": counts["agent"] += row["n"] else: counts["scraper"] += row["n"] return counts # ─── Chart data endpoints ───────────────────────────────────────────────── def get_snapshot_history(conn, days: int = 7) -> list[dict]: """Get snapshot history for charting. Returns list of snapshot dicts.""" rows = conn.execute( """SELECT * FROM metrics_snapshots WHERE ts > datetime('now', ? || ' days') ORDER BY ts ASC""", (f"-{days}",), ).fetchall() return [dict(row) for row in rows] def get_version_changes(conn, days: int = 30) -> list[dict]: """Get points where prompt_version or pipeline_version changed. Used for chart annotations — vertical lines marking deployments. """ rows = conn.execute( """SELECT ts, prompt_version, pipeline_version FROM metrics_snapshots WHERE ts > datetime('now', ? || ' days') ORDER BY ts ASC""", (f"-{days}",), ).fetchall() changes = [] prev_prompt = None prev_pipeline = None for row in rows: if row["prompt_version"] != prev_prompt and prev_prompt is not None: changes.append({ "ts": row["ts"], "type": "prompt", "from": prev_prompt, "to": row["prompt_version"], }) if row["pipeline_version"] != prev_pipeline and prev_pipeline is not None: changes.append({ "ts": row["ts"], "type": "pipeline", "from": prev_pipeline, "to": row["pipeline_version"], }) prev_prompt = row["prompt_version"] prev_pipeline = row["pipeline_version"] return changes