teleo-codex/ops/diagnostics/daily_digest.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00

312 lines
11 KiB
Python

"""Daily digest: aggregates 24h activity for Telegram bot consumption.
Data sources:
- pipeline.db: merged PRs, audit events, contributor activity
- Forgejo API: PR descriptions for claim summaries
- claim-index: total claims, domain breakdown
- review queue: pending approval counts
Endpoint: GET /api/daily-digest?hours=24
"""
import asyncio
import logging
import sqlite3
from datetime import datetime, timezone, timedelta
from typing import Any
import aiohttp
# Module logger under the shared "argus" diagnostics namespace.
logger = logging.getLogger("argus.daily_digest")
# Forgejo REST API root and the knowledge-base repository slug.
FORGEJO_BASE = "https://git.livingip.xyz/api/v1"
REPO = "teleo/teleo-codex"
# claim-index HTTP endpoint — presumably served by the local pipeline process.
CLAIM_INDEX_URL = "http://localhost:8080/claim-index"
async def fetch_daily_digest(
    db_path: str,
    forgejo_token: str | None = None,
    hours: int = 24,
    timeout_s: int = 15,
) -> dict[str, Any]:
    """Build the daily digest payload.

    Aggregates local SQLite pipeline stats with HTTP data from the
    claim-index service and the Forgejo API.

    Args:
        db_path: Path to pipeline.db (opened read-only by ``_query_db``).
        forgejo_token: Optional Forgejo API token; sent as ``token ...`` auth.
        hours: Look-back window for the digest period.
        timeout_s: Per-HTTP-request timeout in seconds.

    Returns structured data for Epimetheus's Telegram bot to format and send.
    """
    cutoff = (datetime.now(timezone.utc) - timedelta(hours=hours)).isoformat()
    # Run the synchronous SQLite queries in a worker thread so slow disk I/O
    # cannot stall the event loop while other coroutines are active.
    db_data = await asyncio.to_thread(_query_db, db_path, cutoff, hours)
    headers = {"Accept": "application/json"}
    if forgejo_token:
        headers["Authorization"] = f"token {forgejo_token}"
    # NOTE(review): ssl=False disables TLS certificate verification for the
    # Forgejo host — presumably a self-signed cert; confirm, and prefer a
    # pinned CA bundle if possible.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
        # Fetch claim-index, merged PR details from Forgejo, and open PR count in parallel
        merged_numbers = [pr["number"] for pr in db_data["merged_prs"]]
        claim_index, pr_details, open_pr_count = await asyncio.gather(
            _fetch_claim_index(session, timeout_s),
            _fetch_merged_pr_details(session, merged_numbers, timeout_s),
            _fetch_open_pr_count(session, timeout_s),
        )
        # Enrich merged PRs with Forgejo descriptions
        merged_claims = _build_merged_claims(db_data["merged_prs"], pr_details)
        return {
            "period_hours": hours,
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "claims_merged": merged_claims,
            "pipeline_stats": {
                "prs_merged": db_data["prs_merged"],
                "prs_opened": db_data["prs_opened"],
                "prs_rejected": db_data["prs_rejected"],
                "approval_rate": db_data["approval_rate"],
                "top_rejection_reasons": db_data["top_rejection_reasons"],
            },
            "agent_activity": db_data["agent_activity"],
            "pending_review": {
                "open_prs": open_pr_count,
            },
            "knowledge_base": {
                "total_claims": claim_index.get("total_claims", 0),
                "domains": claim_index.get("domains", {}),
                "orphan_ratio": claim_index.get("orphan_ratio", 0),
                "cross_domain_links": claim_index.get("cross_domain_links", 0),
            },
        }
def _query_db(db_path: str, cutoff: str, hours: int) -> dict[str, Any]:
"""Run all DB queries synchronously (SQLite is fast enough for digest)."""
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
conn.row_factory = sqlite3.Row
try:
# Merged PRs in period
merged_prs = conn.execute(
"""SELECT number, branch, domain, agent, commit_type, merged_at, cost_usd
FROM prs WHERE status = 'merged' AND merged_at >= ?
ORDER BY merged_at DESC""",
(cutoff,),
).fetchall()
prs_merged = len(merged_prs)
# PRs opened in period
prs_opened = conn.execute(
"SELECT COUNT(*) FROM prs WHERE created_at >= ?", (cutoff,)
).fetchone()[0]
# Rejected PRs in period (closed/zombie with rejection events)
prs_rejected = conn.execute(
"""SELECT COUNT(DISTINCT json_extract(detail, '$.pr'))
FROM audit_log
WHERE stage = 'evaluate'
AND event IN ('domain_rejected', 'tier05_rejected')
AND timestamp >= ?""",
(cutoff,),
).fetchone()[0]
# Approval rate
total_evaluated = prs_merged + prs_rejected
approval_rate = round(prs_merged / total_evaluated * 100, 1) if total_evaluated > 0 else 0.0
# Top rejection reasons
rejection_rows = conn.execute(
"""SELECT json_extract(detail, '$.issues') as issues
FROM audit_log
WHERE stage = 'evaluate'
AND event IN ('domain_rejected', 'tier05_rejected')
AND timestamp >= ?
AND json_valid(detail)""",
(cutoff,),
).fetchall()
reason_counts: dict[str, int] = {}
import json
for row in rejection_rows:
if row["issues"]:
try:
issues = json.loads(row["issues"])
if isinstance(issues, list):
for issue in issues:
reason_counts[issue] = reason_counts.get(issue, 0) + 1
except (json.JSONDecodeError, TypeError):
pass
top_rejection_reasons = sorted(reason_counts.items(), key=lambda x: -x[1])[:5]
top_rejection_reasons = [{"reason": r, "count": c} for r, c in top_rejection_reasons]
# Agent activity — who contributed what
agent_rows = conn.execute(
"""SELECT agent,
COUNT(*) as total,
SUM(CASE WHEN status = 'merged' THEN 1 ELSE 0 END) as merged,
SUM(CASE WHEN commit_type = 'extract' OR commit_type = 'research' THEN 1 ELSE 0 END) as extractions,
SUM(CASE WHEN commit_type = 'challenge' THEN 1 ELSE 0 END) as challenges,
SUM(CASE WHEN commit_type = 'enrich' OR commit_type = 'reweave' THEN 1 ELSE 0 END) as enrichments,
SUM(CASE WHEN commit_type = 'synthesize' THEN 1 ELSE 0 END) as syntheses
FROM prs
WHERE created_at >= ? AND agent IS NOT NULL AND agent != ''
GROUP BY agent
ORDER BY merged DESC""",
(cutoff,),
).fetchall()
agent_activity = [
{
"agent": row["agent"],
"prs_total": row["total"],
"prs_merged": row["merged"],
"extractions": row["extractions"],
"challenges": row["challenges"],
"enrichments": row["enrichments"],
"syntheses": row["syntheses"],
}
for row in agent_rows
]
return {
"merged_prs": [dict(pr) for pr in merged_prs],
"prs_merged": prs_merged,
"prs_opened": prs_opened,
"prs_rejected": prs_rejected,
"approval_rate": approval_rate,
"top_rejection_reasons": top_rejection_reasons,
"agent_activity": agent_activity,
}
finally:
conn.close()
async def _fetch_claim_index(session: aiohttp.ClientSession, timeout_s: int) -> dict:
    """Fetch claim-index summary stats; returns {} on any failure or non-200."""
    timeout = aiohttp.ClientTimeout(total=timeout_s)
    try:
        async with session.get(CLAIM_INDEX_URL, timeout=timeout) as resp:
            if resp.status == 200:
                payload = await resp.json()
                # Pass through only the fields the digest consumes.
                return {
                    "total_claims": payload.get("total_claims", 0),
                    "domains": payload.get("domains", {}),
                    "orphan_ratio": payload.get("orphan_ratio", 0),
                    "cross_domain_links": payload.get("cross_domain_links", 0),
                }
    except Exception as exc:
        logger.warning("Failed to fetch claim-index: %s", exc)
    return {}
async def _fetch_merged_pr_details(
    session: aiohttp.ClientSession,
    pr_numbers: list[int],
    timeout_s: int,
) -> dict[int, dict]:
    """Fetch Forgejo PR details for each merged PR number, concurrently.

    Numbers whose fetch fails (transport error or non-200) map to {}.
    """
    if not pr_numbers:
        return {}
    timeout = aiohttp.ClientTimeout(total=timeout_s)

    async def _one(number: int) -> tuple[int, dict]:
        # Best-effort: log and fall through to an empty payload on failure.
        try:
            async with session.get(
                f"{FORGEJO_BASE}/repos/{REPO}/pulls/{number}", timeout=timeout
            ) as resp:
                if resp.status == 200:
                    return number, await resp.json()
        except Exception as exc:
            logger.warning("Failed to fetch PR #%d: %s", number, exc)
        return number, {}

    pairs = await asyncio.gather(*(_one(num) for num in pr_numbers))
    return dict(pairs)
async def _fetch_open_pr_count(session: aiohttp.ClientSession, timeout_s: int) -> int:
    """Return the number of open PRs on Forgejo; 0 on any failure."""
    url = f"{FORGEJO_BASE}/repos/{REPO}/pulls?state=open&limit=1"
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout_s)) as resp:
            if resp.status == 200:
                # Forgejo reports the full count in the X-Total-Count header.
                header_total = resp.headers.get("X-Total-Count")
                if header_total is not None:
                    return int(header_total)
                # No header: count whatever page the API returned.
                return len(await resp.json())
    except Exception as exc:
        logger.warning("Failed to fetch open PR count: %s", exc)
    return 0
def _build_merged_claims(
    merged_prs: list[dict],
    pr_details: dict[int, dict],
) -> list[dict]:
    """Combine DB rows for merged PRs with their Forgejo metadata.

    Each entry carries the PR identity, attribution fields from the DB row,
    and a short summary extracted from the Forgejo PR body (when present).
    """
    out: list[dict] = []
    for row in merged_prs:
        num = row["number"]
        forgejo = pr_details.get(num, {})
        # Body may be absent or explicitly null in the Forgejo payload.
        body_text = forgejo.get("body", "") or ""
        out.append({
            "pr_number": num,
            "title": forgejo.get("title", row.get("branch", f"PR #{num}")),
            "agent": row.get("agent", "unknown"),
            "domain": row.get("domain", "unknown"),
            "commit_type": row.get("commit_type", "knowledge"),
            "summary": _extract_summary(body_text),
            "merged_at": row.get("merged_at", ""),
            "cost_usd": row.get("cost_usd", 0.0),
            "url": forgejo.get("html_url", ""),
        })
    return out
def _extract_summary(body: str) -> str:
"""Extract a 1-2 sentence summary from PR body markdown.
Looks for a Summary section first, then falls back to first non-header paragraph.
"""
if not body:
return ""
lines = body.strip().split("\n")
# Look for ## Summary section
in_summary = False
summary_lines = []
for line in lines:
if line.strip().lower().startswith("## summary"):
in_summary = True
continue
if in_summary:
if line.startswith("##"):
break
stripped = line.strip()
if stripped and not stripped.startswith("- ["): # skip checklists
summary_lines.append(stripped)
if len(summary_lines) >= 3:
break
if summary_lines:
return " ".join(summary_lines)[:300]
# Fallback: first non-header, non-empty paragraph
for line in lines:
stripped = line.strip()
if stripped and not stripped.startswith("#") and not stripped.startswith("- ["):
return stripped[:300]
return ""