teleo-codex/ops/diagnostics/daily_digest.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00

312 lines
11 KiB
Python

"""Daily digest: aggregates 24h activity for Telegram bot consumption.
Data sources:
- pipeline.db: merged PRs, audit events, contributor activity
- Forgejo API: PR descriptions for claim summaries
- claim-index: total claims, domain breakdown
- review queue: pending approval counts
Endpoint: GET /api/daily-digest?hours=24
"""
import asyncio
import logging
import sqlite3
from datetime import datetime, timezone, timedelta
from typing import Any
import aiohttp
# Module logger under the shared "argus" diagnostics namespace.
logger = logging.getLogger("argus.daily_digest")
# Forgejo REST API root and the knowledge-base repository slug.
FORGEJO_BASE = "https://git.livingip.xyz/api/v1"
REPO = "teleo/teleo-codex"
# claim-index HTTP endpoint — presumably served by the local pipeline process.
CLAIM_INDEX_URL = "http://localhost:8080/claim-index"
async def fetch_daily_digest(
    db_path: str,
    forgejo_token: str | None = None,
    hours: int = 24,
    timeout_s: int = 15,
) -> dict[str, Any]:
    """Build the daily digest payload.

    Aggregates local SQLite pipeline stats with HTTP data from the
    claim-index service and the Forgejo API.

    Args:
        db_path: Path to pipeline.db (opened read-only by ``_query_db``).
        forgejo_token: Optional Forgejo API token; sent as ``token ...`` auth.
        hours: Look-back window for the digest period.
        timeout_s: Per-HTTP-request timeout in seconds.

    Returns structured data for Epimetheus's Telegram bot to format and send.
    """
    cutoff = (datetime.now(timezone.utc) - timedelta(hours=hours)).isoformat()
    # Run the synchronous SQLite queries in a worker thread so slow disk I/O
    # cannot stall the event loop while other coroutines are active.
    db_data = await asyncio.to_thread(_query_db, db_path, cutoff, hours)
    headers = {"Accept": "application/json"}
    if forgejo_token:
        headers["Authorization"] = f"token {forgejo_token}"
    # NOTE(review): ssl=False disables TLS certificate verification for the
    # Forgejo host — presumably a self-signed cert; confirm, and prefer a
    # pinned CA bundle if possible.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
        # Fetch claim-index, merged PR details from Forgejo, and open PR count in parallel
        merged_numbers = [pr["number"] for pr in db_data["merged_prs"]]
        claim_index, pr_details, open_pr_count = await asyncio.gather(
            _fetch_claim_index(session, timeout_s),
            _fetch_merged_pr_details(session, merged_numbers, timeout_s),
            _fetch_open_pr_count(session, timeout_s),
        )
        # Enrich merged PRs with Forgejo descriptions
        merged_claims = _build_merged_claims(db_data["merged_prs"], pr_details)
        return {
            "period_hours": hours,
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "claims_merged": merged_claims,
            "pipeline_stats": {
                "prs_merged": db_data["prs_merged"],
                "prs_opened": db_data["prs_opened"],
                "prs_rejected": db_data["prs_rejected"],
                "approval_rate": db_data["approval_rate"],
                "top_rejection_reasons": db_data["top_rejection_reasons"],
            },
            "agent_activity": db_data["agent_activity"],
            "pending_review": {
                "open_prs": open_pr_count,
            },
            "knowledge_base": {
                "total_claims": claim_index.get("total_claims", 0),
                "domains": claim_index.get("domains", {}),
                "orphan_ratio": claim_index.get("orphan_ratio", 0),
                "cross_domain_links": claim_index.get("cross_domain_links", 0),
            },
        }
def _query_db(db_path: str, cutoff: str, hours: int) -> dict[str, Any]:
"""Run all DB queries synchronously (SQLite is fast enough for digest)."""
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
conn.row_factory = sqlite3.Row
try:
# Merged PRs in period
merged_prs = conn.execute(
"""SELECT number, branch, domain, agent, commit_type, merged_at, cost_usd
FROM prs WHERE status = 'merged' AND merged_at >= ?
ORDER BY merged_at DESC""",
(cutoff,),
).fetchall()
prs_merged = len(merged_prs)
# PRs opened in period
prs_opened = conn.execute(
"SELECT COUNT(*) FROM prs WHERE created_at >= ?", (cutoff,)
).fetchone()[0]
# Rejected PRs in period (closed/zombie with rejection events)
prs_rejected = conn.execute(
"""SELECT COUNT(DISTINCT json_extract(detail, '$.pr'))
FROM audit_log
WHERE stage = 'evaluate'
AND event IN ('domain_rejected', 'tier05_rejected')
AND timestamp >= ?""",
(cutoff,),
).fetchone()[0]
# Approval rate
total_evaluated = prs_merged + prs_rejected
approval_rate = round(prs_merged / total_evaluated * 100, 1) if total_evaluated > 0 else 0.0
# Top rejection reasons
rejection_rows = conn.execute(
"""SELECT json_extract(detail, '$.issues') as issues
FROM audit_log
WHERE stage = 'evaluate'
AND event IN ('domain_rejected', 'tier05_rejected')
AND timestamp >= ?
AND json_valid(detail)""",
(cutoff,),
).fetchall()
reason_counts: dict[str, int] = {}
import json
for row in rejection_rows:
if row["issues"]:
try:
issues = json.loads(row["issues"])
if isinstance(issues, list):
for issue in issues:
reason_counts[issue] = reason_counts.get(issue, 0) + 1
except (json.JSONDecodeError, TypeError):
pass
top_rejection_reasons = sorted(reason_counts.items(), key=lambda x: -x[1])[:5]
top_rejection_reasons = [{"reason": r, "count": c} for r, c in top_rejection_reasons]
# Agent activity — who contributed what
agent_rows = conn.execute(
"""SELECT agent,
COUNT(*) as total,
SUM(CASE WHEN status = 'merged' THEN 1 ELSE 0 END) as merged,
SUM(CASE WHEN commit_type = 'extract' OR commit_type = 'research' THEN 1 ELSE 0 END) as extractions,
SUM(CASE WHEN commit_type = 'challenge' THEN 1 ELSE 0 END) as challenges,
SUM(CASE WHEN commit_type = 'enrich' OR commit_type = 'reweave' THEN 1 ELSE 0 END) as enrichments,
SUM(CASE WHEN commit_type = 'synthesize' THEN 1 ELSE 0 END) as syntheses
FROM prs
WHERE created_at >= ? AND agent IS NOT NULL AND agent != ''
GROUP BY agent
ORDER BY merged DESC""",
(cutoff,),
).fetchall()
agent_activity = [
{
"agent": row["agent"],
"prs_total": row["total"],
"prs_merged": row["merged"],
"extractions": row["extractions"],
"challenges": row["challenges"],
"enrichments": row["enrichments"],
"syntheses": row["syntheses"],
}
for row in agent_rows
]
return {
"merged_prs": [dict(pr) for pr in merged_prs],
"prs_merged": prs_merged,
"prs_opened": prs_opened,
"prs_rejected": prs_rejected,
"approval_rate": approval_rate,
"top_rejection_reasons": top_rejection_reasons,
"agent_activity": agent_activity,
}
finally:
conn.close()
async def _fetch_claim_index(session: aiohttp.ClientSession, timeout_s: int) -> dict:
    """Fetch claim-index summary stats; returns {} on any failure or non-200."""
    timeout = aiohttp.ClientTimeout(total=timeout_s)
    try:
        async with session.get(CLAIM_INDEX_URL, timeout=timeout) as resp:
            if resp.status == 200:
                payload = await resp.json()
                # Pass through only the fields the digest consumes.
                return {
                    "total_claims": payload.get("total_claims", 0),
                    "domains": payload.get("domains", {}),
                    "orphan_ratio": payload.get("orphan_ratio", 0),
                    "cross_domain_links": payload.get("cross_domain_links", 0),
                }
    except Exception as exc:
        logger.warning("Failed to fetch claim-index: %s", exc)
    return {}
async def _fetch_merged_pr_details(
    session: aiohttp.ClientSession,
    pr_numbers: list[int],
    timeout_s: int,
) -> dict[int, dict]:
    """Fetch Forgejo PR details for each merged PR number, concurrently.

    Numbers whose fetch fails (transport error or non-200) map to {}.
    """
    if not pr_numbers:
        return {}
    timeout = aiohttp.ClientTimeout(total=timeout_s)

    async def _one(number: int) -> tuple[int, dict]:
        # Best-effort: log and fall through to an empty payload on failure.
        try:
            async with session.get(
                f"{FORGEJO_BASE}/repos/{REPO}/pulls/{number}", timeout=timeout
            ) as resp:
                if resp.status == 200:
                    return number, await resp.json()
        except Exception as exc:
            logger.warning("Failed to fetch PR #%d: %s", number, exc)
        return number, {}

    pairs = await asyncio.gather(*(_one(num) for num in pr_numbers))
    return dict(pairs)
async def _fetch_open_pr_count(session: aiohttp.ClientSession, timeout_s: int) -> int:
    """Return the number of open PRs on Forgejo; 0 on any failure."""
    url = f"{FORGEJO_BASE}/repos/{REPO}/pulls?state=open&limit=1"
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout_s)) as resp:
            if resp.status == 200:
                # Forgejo reports the full count in the X-Total-Count header.
                header_total = resp.headers.get("X-Total-Count")
                if header_total is not None:
                    return int(header_total)
                # No header: count whatever page the API returned.
                return len(await resp.json())
    except Exception as exc:
        logger.warning("Failed to fetch open PR count: %s", exc)
    return 0
def _build_merged_claims(
    merged_prs: list[dict],
    pr_details: dict[int, dict],
) -> list[dict]:
    """Combine DB rows for merged PRs with their Forgejo metadata.

    Each entry carries the PR identity, attribution fields from the DB row,
    and a short summary extracted from the Forgejo PR body (when present).
    """
    out: list[dict] = []
    for row in merged_prs:
        num = row["number"]
        forgejo = pr_details.get(num, {})
        # Body may be absent or explicitly null in the Forgejo payload.
        body_text = forgejo.get("body", "") or ""
        out.append({
            "pr_number": num,
            "title": forgejo.get("title", row.get("branch", f"PR #{num}")),
            "agent": row.get("agent", "unknown"),
            "domain": row.get("domain", "unknown"),
            "commit_type": row.get("commit_type", "knowledge"),
            "summary": _extract_summary(body_text),
            "merged_at": row.get("merged_at", ""),
            "cost_usd": row.get("cost_usd", 0.0),
            "url": forgejo.get("html_url", ""),
        })
    return out
def _extract_summary(body: str) -> str:
"""Extract a 1-2 sentence summary from PR body markdown.
Looks for a Summary section first, then falls back to first non-header paragraph.
"""
if not body:
return ""
lines = body.strip().split("\n")
# Look for ## Summary section
in_summary = False
summary_lines = []
for line in lines:
if line.strip().lower().startswith("## summary"):
in_summary = True
continue
if in_summary:
if line.startswith("##"):
break
stripped = line.strip()
if stripped and not stripped.startswith("- ["): # skip checklists
summary_lines.append(stripped)
if len(summary_lines) >= 3:
break
if summary_lines:
return " ".join(summary_lines)[:300]
# Fallback: first non-header, non-empty paragraph
for line in lines:
stripped = line.strip()
if stripped and not stripped.startswith("#") and not stripped.startswith("- ["):
return stripped[:300]
return ""