teleo-infrastructure/lib/health.py
m3taversal a7251d7529
Some checks failed
CI / lint-and-test (pull_request) Has been cancelled
ganymede: add dev infrastructure — pyproject.toml, CI, deploy script
Phase 2 of pipeline refactoring:

- pyproject.toml: Python >=3.11, aiohttp dep, dev extras (pytest,
  pytest-asyncio, ruff). Ruff configured with sane defaults + ignore
  rules for existing code patterns (implicit Optional, timezone.utc).
- .forgejo/workflows/ci.yml: Forgejo Actions CI — syntax check, ruff
  lint, ruff format, pytest on every PR and push to main.
- deploy.sh: Pull + venv update + syntax check + optional restart.
  Replaces ad-hoc scp workflow.
- tests/conftest.py: Shared fixture for in-memory SQLite with full
  schema. Ready for Phase 4 test suite.
- .gitignore: Added venv, pytest cache, coverage, build artifacts.
- Ruff auto-fixes: import sorting, unused imports removed across all
  modules. All files pass ruff check + ruff format.

Pentagon-Agent: Ganymede <F99EBFA6-547B-4096-BEEA-1D59C3E4028A>
2026-03-13 14:24:27 +00:00

244 lines
9.2 KiB
Python

"""Health API — HTTP server on configurable port for monitoring."""
import logging
from datetime import date, datetime, timezone
from aiohttp import web
from . import config, costs, db
logger = logging.getLogger("pipeline.health")
def _conn(request):
    """Return the long-lived readonly DB connection stashed on the app."""
    app = request.app
    return app["db"]
async def handle_health(request):
    """GET /health — overall pipeline health.

    Aggregates circuit-breaker state, queue depths, budget status, and
    24h "metabolic" rates into a single JSON body. Responds 200 only
    when the computed status is "healthy"; any other status returns 503
    so external monitors can alert on the HTTP code alone.
    """
    conn = _conn(request)

    # Stage status from circuit breakers
    breakers = conn.execute(
        "SELECT name, state, failures, last_success_at, last_update FROM circuit_breakers"
    ).fetchall()

    # Queue depths
    sources_by_status = conn.execute(
        "SELECT status, COUNT(*) as n FROM sources GROUP BY status"
    ).fetchall()
    prs_by_status = conn.execute(
        "SELECT status, COUNT(*) as n FROM prs GROUP BY status"
    ).fetchall()

    # Per-domain merge queue depth (Vida)
    merge_queue = conn.execute(
        "SELECT domain, COUNT(*) as n FROM prs WHERE status = 'approved' GROUP BY domain"
    ).fetchall()

    # Cost
    budget = costs.check_budget(conn)

    # Metabolic metrics (Vida). NULLIF(COUNT(*), 0) makes the ratio NULL
    # (not a division error) when the 24h window is empty.
    null_rate = conn.execute(
        """SELECT
            CAST(SUM(CASE WHEN status = 'null_result' THEN 1 ELSE 0 END) AS REAL) /
            NULLIF(COUNT(*), 0) as rate
        FROM sources
        WHERE updated_at > datetime('now', '-24 hours')
        AND status IN ('extracted', 'null_result', 'error')"""
    ).fetchone()
    approval_rate = conn.execute(
        """SELECT
            CAST(SUM(CASE WHEN domain_verdict = 'approve' THEN 1 ELSE 0 END) AS REAL) /
            NULLIF(COUNT(*), 0) as domain_rate,
            CAST(SUM(CASE WHEN leo_verdict = 'approve' THEN 1 ELSE 0 END) AS REAL) /
            NULLIF(COUNT(*), 0) as leo_rate
        FROM prs
        WHERE last_attempt > datetime('now', '-24 hours')
        AND domain_verdict != 'pending'"""
    ).fetchone()

    # Recent activity (last hour)
    recent = conn.execute(
        """SELECT stage, event, COUNT(*) as n
        FROM audit_log
        WHERE timestamp > datetime('now', '-1 hour')
        GROUP BY stage, event"""
    ).fetchall()

    body = {
        "status": "healthy",
        "breakers": {},
        "sources": {r["status"]: r["n"] for r in sources_by_status},
        "prs": {r["status"]: r["n"] for r in prs_by_status},
        "merge_queue_by_domain": {r["domain"]: r["n"] for r in merge_queue},
        "budget": budget,
        "metabolic": {
            "null_result_rate_24h": round(null_rate["rate"], 3)
            if null_rate and null_rate["rate"] is not None
            else None,
            "domain_approval_rate_24h": round(approval_rate["domain_rate"], 3)
            if approval_rate and approval_rate["domain_rate"] is not None
            else None,
            "leo_approval_rate_24h": round(approval_rate["leo_rate"], 3)
            if approval_rate and approval_rate["leo_rate"] is not None
            else None,
        },
        "recent_activity": [
            {"stage": r["stage"], "event": r["event"], "count": r["n"]} for r in recent
        ],
    }

    # Stall thresholds per stage: loop-invariant, so built once here
    # rather than on every breaker iteration (was rebuilt inside the loop).
    intervals = {
        "ingest": config.INGEST_INTERVAL,
        "validate": config.VALIDATE_INTERVAL,
        "evaluate": config.EVAL_INTERVAL,
        "merge": config.MERGE_INTERVAL,
    }

    # Breaker state + stall detection (Vida: last_success_at heartbeat)
    for r in breakers:
        breaker_info = {"state": r["state"], "failures": r["failures"]}
        if r["last_success_at"]:
            last = datetime.fromisoformat(r["last_success_at"])
            if last.tzinfo is None:
                # Stored timestamps appear to be naive UTC — make aware
                # before subtracting from an aware "now".
                last = last.replace(tzinfo=timezone.utc)
            age_s = (datetime.now(timezone.utc) - last).total_seconds()
            breaker_info["last_success_age_s"] = round(age_s)
            # Stall detection: no success in 2x the stage's interval
            # (unknown stages fall back to a 60s interval).
            threshold = intervals.get(r["name"], 60) * 2
            if age_s > threshold:
                breaker_info["stalled"] = True
        body["breakers"][r["name"]] = breaker_info

    # Overall status — later checks win, so the precedence is
    # stalled < degraded < budget_exhausted.
    if any(b.get("stalled") for b in body["breakers"].values()):
        body["status"] = "stalled"
    if any(b["state"] == "open" for b in body["breakers"].values()):
        body["status"] = "degraded"
    if not budget["ok"]:
        body["status"] = "budget_exhausted"

    # Rubber-stamp warning (Vida)
    if approval_rate and approval_rate["domain_rate"] is not None and approval_rate["domain_rate"] > 0.95:
        body["metabolic"]["warning"] = "domain approval rate >95% — possible rubber-stamping"

    status_code = 200 if body["status"] == "healthy" else 503
    return web.json_response(body, status=status_code)
async def handle_costs(request):
    """GET /costs — daily cost breakdown.

    Accepts an optional ``?date=YYYY-MM-DD`` query parameter.
    """
    conn = _conn(request)
    # NOTE(review): default is the server's *local* date, while the SQL
    # elsewhere uses UTC ('now') — confirm that mismatch is intended.
    requested = request.query.get("date")
    day = requested if requested is not None else date.today().isoformat()
    breakdown = costs.get_daily_breakdown(conn, day)
    budget = costs.check_budget(conn)
    return web.json_response({"date": day, "budget": budget, "breakdown": breakdown})
async def handle_sources(request):
    """GET /sources — source pipeline status.

    Optional ``?status=`` filter; always returns the 50 most recently
    updated matching rows.
    """
    conn = _conn(request)
    wanted = request.query.get("status")
    # Assemble one query instead of duplicating the column list twice.
    sql = (
        "SELECT path, status, priority, claims_count, transient_retries,"
        " substantive_retries, updated_at FROM sources"
    )
    params = ()
    if wanted:
        sql += " WHERE status = ?"
        params = (wanted,)
    sql += " ORDER BY updated_at DESC LIMIT 50"
    rows = conn.execute(sql, params).fetchall()
    return web.json_response({"sources": [dict(r) for r in rows]})
async def handle_prs(request):
    """GET /prs — PR pipeline status.

    Optional ``?status=`` filter; always returns the 50 highest-numbered
    matching PRs.
    """
    conn = _conn(request)
    wanted = request.query.get("status")
    # Assemble one query instead of duplicating the column list twice.
    sql = (
        "SELECT number, source_path, status, domain, tier, leo_verdict,"
        " domain_verdict, transient_retries, substantive_retries FROM prs"
    )
    params = ()
    if wanted:
        sql += " WHERE status = ?"
        params = (wanted,)
    sql += " ORDER BY number DESC LIMIT 50"
    rows = conn.execute(sql, params).fetchall()
    return web.json_response({"prs": [dict(r) for r in rows]})
async def handle_breakers(request):
    """GET /breakers — circuit breaker states."""
    records = _conn(request).execute("SELECT * FROM circuit_breakers").fetchall()
    return web.json_response({"breakers": [dict(row) for row in records]})
async def handle_calibration(request):
    """GET /calibration — priority calibration analysis (Vida).

    Compares each source's first and latest logged priority and reports
    upgrades vs downgrades. Per Theseus, upgrades are the learnable
    signal; downgrades are expected downstream noise.
    """
    import json  # hoisted from the loop body — was re-executed per row

    conn = _conn(request)
    # Find sources where eval disagreed with ingest priority.
    # json_array_length >= 2 pre-filters to rows with at least two
    # priority entries; the len(log) check below re-guards after parsing.
    rows = conn.execute(
        """SELECT path, priority, priority_log FROM sources
        WHERE json_array_length(priority_log) >= 2"""
    ).fetchall()

    # Ordinal ranking of priority labels; unknown labels rank as "medium".
    # Loop-invariant, so built once (was rebuilt on every iteration).
    levels = {"critical": 4, "high": 3, "medium": 2, "low": 1, "skip": 0}

    upgrades = []
    downgrades = []
    for r in rows:
        log = json.loads(r["priority_log"] or "[]")
        if len(log) < 2:
            continue
        first = log[0]["priority"]
        last = log[-1]["priority"]
        if levels.get(last, 2) > levels.get(first, 2):
            upgrades.append({"path": r["path"], "from": first, "to": last})
        elif levels.get(last, 2) < levels.get(first, 2):
            downgrades.append({"path": r["path"], "from": first, "to": last})

    return web.json_response(
        {
            "upgrades": upgrades[:20],
            "downgrades_count": len(downgrades),
            "upgrades_count": len(upgrades),
            "note": "Focus on upgrades — downgrades are expected (downstream has more context)",
        }
    )
def create_app() -> web.Application:
    """Create the health API application."""
    app = web.Application()
    # One persistent readonly connection shared by every handler — no
    # per-request connection churn (Ganymede)
    app["db"] = db.get_connection(readonly=True)
    routes = (
        ("/health", handle_health),
        ("/costs", handle_costs),
        ("/sources", handle_sources),
        ("/prs", handle_prs),
        ("/breakers", handle_breakers),
        ("/calibration", handle_calibration),
    )
    for path, handler in routes:
        app.router.add_get(path, handler)
    app.on_cleanup.append(_cleanup)
    return app
async def _cleanup(app):
    """Close the shared readonly DB connection on app shutdown."""
    conn = app["db"]
    conn.close()
async def start_health_server(runner_ref: list):
    """Start the health HTTP server. Stores runner in runner_ref for shutdown."""
    runner = web.AppRunner(create_app())
    await runner.setup()
    # Loopback bind only — external access goes through a reverse proxy (Ganymede)
    site = web.TCPSite(runner, "127.0.0.1", config.HEALTH_PORT)
    await site.start()
    runner_ref.append(runner)
    logger.info("Health API listening on 127.0.0.1:%d", config.HEALTH_PORT)
async def stop_health_server(runner_ref: list):
    """Stop the health HTTP server."""
    # Tolerates an empty list (server never started) and, in principle,
    # multiple runners.
    for active_runner in runner_ref:
        await active_runner.cleanup()
    logger.info("Health API stopped")