teleo-infrastructure/telegram/bot.py
m3taversal d97f68714a epimetheus: fix 2 nits from Ganymede final review
1. _merge_pr marked as CURRENTLY UNUSED (local ff-push is primary path)
2. Conversation window messages skip cold rate limit check (window counter IS the limit)

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
2026-03-20 20:25:06 +00:00

713 lines
27 KiB
Python

#!/usr/bin/env python3
"""Teleo Telegram Bot — Rio as analytical agent in community groups.
Architecture:
- Always-on ingestion: captures all messages, batch triage every N minutes
- Tag-based response: Opus-quality KB-grounded responses when @tagged
- Conversation-window triage: identifies coherent claims across message threads
- Full eval tracing: Rio's responses are logged as KB claims, accountable
Two paths (Ganymede architecture):
- Fast path (read): tag → KB query → Opus response → post to group
- Slow path (write): batch triage → archive to inbox/ → pipeline extracts
Separate systemd service: teleo-telegram.service
Does NOT integrate with pipeline daemon.
Epimetheus owns this module.
"""
import asyncio
import logging
import os
import re
import sqlite3
import sys
import time
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
# Add pipeline lib to path for shared modules
sys.path.insert(0, "/opt/teleo-eval/pipeline")
from telegram import Update
from telegram.ext import (
Application,
CommandHandler,
ContextTypes,
MessageHandler,
filters,
)
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from kb_retrieval import KBIndex, format_context_for_prompt, retrieve_context
from market_data import get_token_price, format_price_context
from worktree_lock import main_worktree_lock
# ─── Config ─────────────────────────────────────────────────────────────
BOT_TOKEN_FILE = "/opt/teleo-eval/secrets/telegram-bot-token"
OPENROUTER_KEY_FILE = "/opt/teleo-eval/secrets/openrouter-key"
PIPELINE_DB = "/opt/teleo-eval/pipeline/pipeline.db"
KB_READ_DIR = "/opt/teleo-eval/workspaces/main" # For KB retrieval (clean main branch)
ARCHIVE_DIR = "/opt/teleo-eval/workspaces/main" # For archiving sources (push_main_with_retry)
LOG_FILE = "/opt/teleo-eval/logs/telegram-bot.log"
# Triage interval (seconds)
TRIAGE_INTERVAL = 900 # 15 minutes
# Models
RESPONSE_MODEL = "anthropic/claude-opus-4-6" # Opus for tagged responses
TRIAGE_MODEL = "anthropic/claude-haiku-4.5" # Haiku for batch triage
# Rate limits
MAX_RESPONSE_PER_USER_PER_HOUR = 6
MIN_MESSAGE_LENGTH = 20 # Skip very short messages
# ─── Logging ────────────────────────────────────────────────────────────
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(name)s [%(levelname)s] %(message)s",
handlers=[
logging.FileHandler(LOG_FILE),
logging.StreamHandler(),
],
)
logger = logging.getLogger("telegram-bot")
# ─── State ──────────────────────────────────────────────────────────────
# Message buffer for batch triage
message_buffer: list[dict] = []
# Rate limiting
user_response_times: dict[int, list[float]] = defaultdict(list)
# Allowed group IDs (set after first message received, or configure)
allowed_groups: set[int] = set()
# Shared KB index (built once, refreshed on mtime change)
kb_index = KBIndex(KB_READ_DIR)
# Conversation windows — track active conversations per (chat_id, user_id)
# Rhea's model: count unanswered messages, reset on bot response, expire at threshold
CONVERSATION_WINDOW = 5 # expire after 5 unanswered messages
unanswered_count: dict[tuple[int, int], int] = {} # (chat_id, user_id) → unanswered count
# Conversation history — last N exchanges for prompt context (Ganymede: high-value change)
MAX_HISTORY = 5
conversation_history: dict[tuple[int, int], list[dict]] = {} # (chat_id, user_id) → [{user, bot}]
# ─── Helpers ────────────────────────────────────────────────────────────
def get_db_stats() -> dict:
"""Get basic KB stats from pipeline DB."""
try:
conn = sqlite3.connect(PIPELINE_DB, timeout=5)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA query_only=ON")
merged = conn.execute("SELECT COUNT(*) as n FROM prs WHERE status='merged'").fetchone()["n"]
contributors = conn.execute("SELECT COUNT(*) as n FROM contributors").fetchone()["n"]
conn.close()
return {"merged_claims": merged, "contributors": contributors}
except Exception:
return {"merged_claims": "?", "contributors": "?"}
async def call_openrouter(model: str, prompt: str, max_tokens: int = 2048) -> str | None:
"""Call OpenRouter API."""
import aiohttp
key = Path(OPENROUTER_KEY_FILE).read_text().strip()
payload = {
"model": model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": max_tokens,
"temperature": 0.3,
}
try:
async with aiohttp.ClientSession() as session:
async with session.post(
"https://openrouter.ai/api/v1/chat/completions",
headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
json=payload,
timeout=aiohttp.ClientTimeout(total=120),
) as resp:
if resp.status >= 400:
logger.error("OpenRouter %s%d", model, resp.status)
return None
data = await resp.json()
return data.get("choices", [{}])[0].get("message", {}).get("content")
except Exception as e:
logger.error("OpenRouter error: %s", e)
return None
def is_rate_limited(user_id: int) -> bool:
"""Check if a user has exceeded the response rate limit."""
now = time.time()
times = user_response_times[user_id]
# Prune old entries
times[:] = [t for t in times if now - t < 3600]
return len(times) >= MAX_RESPONSE_PER_USER_PER_HOUR
def sanitize_message(text: str) -> str:
"""Sanitize message content before sending to LLM. (Ganymede: security)"""
# Strip code blocks (potential prompt injection)
text = re.sub(r"```.*?```", "[code block removed]", text, flags=re.DOTALL)
# Strip anything that looks like system instructions
text = re.sub(r"(system:|assistant:|human:|<\|.*?\|>)", "", text, flags=re.IGNORECASE)
# Truncate
return text[:2000]
def _git_commit_archive(archive_path, filename: str):
"""Commit archived source to git so it survives git clean. (Rio review: data loss bug)"""
import subprocess
try:
cwd = ARCHIVE_DIR
subprocess.run(["git", "add", str(archive_path)], cwd=cwd, timeout=10,
capture_output=True, check=False)
result = subprocess.run(
["git", "commit", "-m", f"telegram: archive {filename}\n\n"
"Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>"],
cwd=cwd, timeout=10, capture_output=True, check=False,
)
if result.returncode == 0:
# Push with retry (Ganymede: abort rebase on failure, don't lose the file)
for attempt in range(3):
rebase = subprocess.run(["git", "pull", "--rebase", "origin", "main"],
cwd=cwd, timeout=30, capture_output=True, check=False)
if rebase.returncode != 0:
subprocess.run(["git", "rebase", "--abort"], cwd=cwd, timeout=10,
capture_output=True, check=False)
logger.warning("Git rebase failed for archive %s (attempt %d), aborted", filename, attempt + 1)
continue
push = subprocess.run(["git", "push", "origin", "main"],
cwd=cwd, timeout=30, capture_output=True, check=False)
if push.returncode == 0:
logger.info("Git committed archive: %s", filename)
return
# All retries failed — file is still on filesystem (safety net), commit is uncommitted
logger.warning("Git push failed for archive %s after 3 attempts (file preserved on disk)", filename)
except Exception as e:
logger.warning("Git commit archive failed: %s", e)
def _format_conversation_history(chat_id: int, user_id: int) -> str:
"""Format conversation history for injection into the Opus prompt."""
key = (chat_id, user_id)
history = conversation_history.get(key, [])
if not history:
return "(No prior conversation)"
lines = []
for exchange in history:
lines.append(f"User: {exchange['user']}")
lines.append(f"Rio: {exchange['bot']}")
lines.append("")
return "\n".join(lines)
# ─── Message Handlers ───────────────────────────────────────────────────
def _is_reply_to_bot(update: Update, context: ContextTypes.DEFAULT_TYPE) -> bool:
"""Check if a message is a reply to one of the bot's own messages."""
msg = update.message
if not msg or not msg.reply_to_message:
return False
replied = msg.reply_to_message
return replied.from_user is not None and replied.from_user.id == context.bot.id
async def handle_reply_to_bot(update: Update, context: ContextTypes.DEFAULT_TYPE):
"""Handle replies to the bot's messages — treat as tagged conversation."""
if not _is_reply_to_bot(update, context):
# Not a reply to us — fall through to buffer handler
await handle_message(update, context)
return
logger.info("Reply to bot from @%s",
update.message.from_user.username if update.message.from_user else "unknown")
await handle_tagged(update, context)
async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
"""Handle ALL incoming group messages — buffer for triage."""
if not update.message or not update.message.text:
return
msg = update.message
text = msg.text.strip()
# Skip very short messages
if len(text) < MIN_MESSAGE_LENGTH:
return
# Check if user is in an active conversation window (Rhea: unanswered counter model)
# Window counter IS the rate limit — don't check cold rate limit (Ganymede: separate budget)
user = msg.from_user
if user:
key = (msg.chat_id, user.id)
if key in unanswered_count and unanswered_count[key] < CONVERSATION_WINDOW:
unanswered_count[key] += 1
logger.info("Conversation window: @%s msg %d/%d",
user.username or "?", unanswered_count[key], CONVERSATION_WINDOW)
await handle_tagged(update, context)
return
# Buffer for batch triage
message_buffer.append({
"text": sanitize_message(text),
"user_id": msg.from_user.id if msg.from_user else None,
"username": msg.from_user.username if msg.from_user else None,
"display_name": msg.from_user.full_name if msg.from_user else None,
"chat_id": msg.chat_id,
"message_id": msg.message_id,
"timestamp": msg.date.isoformat() if msg.date else datetime.now(timezone.utc).isoformat(),
"reply_to": msg.reply_to_message.message_id if msg.reply_to_message else None,
})
async def handle_tagged(update: Update, context: ContextTypes.DEFAULT_TYPE):
"""Handle messages that tag the bot — Rio responds with Opus."""
if not update.message or not update.message.text:
return
msg = update.message
user = msg.from_user
text = sanitize_message(msg.text)
# Rate limit check
if user and is_rate_limited(user.id):
await msg.reply_text("I'm processing other requests — try again in a few minutes.")
return
logger.info("Tagged by @%s: %s", user.username if user else "unknown", text[:100])
# Send typing indicator
await msg.chat.send_action("typing")
# Retrieve full KB context (entity resolution + claim search + agent positions)
kb_ctx = retrieve_context(text, KB_READ_DIR, index=kb_index)
kb_context_text = format_context_for_prompt(kb_ctx)
stats = get_db_stats()
# Fetch live market data for any tokens mentioned (Rhea: market-data API)
market_context = ""
token_mentions = re.findall(r"\$([A-Z]{2,10})", text.upper())
# Entity name → token mapping for natural language mentions
ENTITY_TOKEN_MAP = {
"omnipair": "OMFG", "metadao": "META", "sanctum": "CLOUD",
"drift": "DRIFT", "ore": "ORE", "jupiter": "JUP",
}
text_lower = text.lower()
for name, ticker in ENTITY_TOKEN_MAP.items():
if name in text_lower:
token_mentions.append(ticker)
# Also check entity matches from KB retrieval
for ent in kb_ctx.entities:
for tag in ent.tags:
if tag.upper() in ENTITY_TOKEN_MAP.values():
token_mentions.append(tag.upper())
for token in set(token_mentions):
try:
data = await get_token_price(token)
if data:
price_str = format_price_context(data, token)
if price_str:
market_context += price_str + "\n"
except Exception:
pass # Market data is supplementary — never block on failure
# Build Opus prompt — Rio's voice
prompt = f"""You are Rio, an internet finance analyst responding in a Telegram group. You have deep knowledge about futarchy, prediction markets, token governance, and the MetaDAO ecosystem.
## How to sound
Write like a sharp analyst talking to peers, not like an AI. Specifically:
- Use your knowledge naturally. Don't say "the KB tracks" or "at experimental confidence" or "our claims show." Just state what you know and how confident you are in plain language.
- Have a take. You're an analyst, not a summarizer. Say what you actually think.
- Keep it tight. 2-3 paragraphs. No walls of text.
- Sound human. Avoid em dashes, avoid starting sentences with "That said" or "The honest X is." Vary your sentence structure. Be direct.
- No markdown. Plain text only, no asterisks or formatting. Use line breaks between paragraphs.
- When you're uncertain, just say so simply. "I'm not sure about X" beats "we don't have data on this yet."
## What you know about this topic
{kb_context_text}
{f"## Live Market Data{chr(10)}{market_context}" if market_context else ""}
## Conversation History
{_format_conversation_history(msg.chat_id, user.id if user else 0)}
## The message you're responding to
From: @{user.username if user else 'unknown'}
Message: {text}
Respond now. Be substantive but concise. If they're wrong about something, say so directly. If they know something you don't, tell them it's worth digging into. If they correct you, accept it and build on the correction."""
# Call Opus
response = await call_openrouter(RESPONSE_MODEL, prompt, max_tokens=1024)
if not response:
await msg.reply_text("Processing error — I'll get back to you.")
return
# Post response
await msg.reply_text(response)
# Update conversation state: reset window, store history (Ganymede+Rhea)
if user:
key = (msg.chat_id, user.id)
unanswered_count[key] = 0 # reset — conversation alive
history = conversation_history.setdefault(key, [])
history.append({"user": text[:500], "bot": response[:500]})
if len(history) > MAX_HISTORY:
history.pop(0)
# Record rate limit
if user:
user_response_times[user.id].append(time.time())
# Log the exchange for audit trail
logger.info("Rio responded to @%s (msg_id=%d)", user.username if user else "?", msg.message_id)
# Detect and fetch URLs for pipeline ingestion
urls = _extract_urls(text)
url_content = None
if urls:
logger.info("Fetching URL: %s", urls[0])
url_content = await _fetch_url_content(urls[0])
if url_content:
logger.info("Fetched %d chars from %s", len(url_content), urls[0])
# Archive the exchange as a source for pipeline (slow path)
_archive_exchange(text, response, user, msg, url_content=url_content, urls=urls)
async def _fetch_url_content(url: str) -> str | None:
"""Fetch article/page content from a URL for pipeline ingestion."""
import aiohttp
try:
async with aiohttp.ClientSession() as session:
async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
if resp.status >= 400:
return None
html = await resp.text()
# Strip HTML tags for plain text (basic — upgrade to readability later)
text = re.sub(r"<script.*?</script>", "", html, flags=re.DOTALL)
text = re.sub(r"<style.*?</style>", "", text, flags=re.DOTALL)
text = re.sub(r"<[^>]+>", " ", text)
text = re.sub(r"\s+", " ", text).strip()
return text[:10000] # Cap at 10K chars
except Exception as e:
logger.warning("Failed to fetch URL %s: %s", url, e)
return None
def _extract_urls(text: str) -> list[str]:
"""Extract URLs from message text."""
return re.findall(r"https?://[^\s<>\"']+", text)
def _archive_exchange(user_text: str, rio_response: str, user, msg,
url_content: str | None = None, urls: list[str] | None = None):
"""Archive a tagged exchange to inbox/queue/ for pipeline processing."""
try:
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
username = user.username if user else "anonymous"
slug = re.sub(r"[^a-z0-9]+", "-", user_text[:50].lower()).strip("-")
filename = f"{date_str}-telegram-{username}-{slug}.md"
archive_path = Path(ARCHIVE_DIR) / "inbox" / "queue" / filename
archive_path.parent.mkdir(parents=True, exist_ok=True)
# Extract rationale (the user's text minus the @mention and URL)
rationale = re.sub(r"@\w+", "", user_text).strip()
for url in (urls or []):
rationale = rationale.replace(url, "").strip()
# Determine priority — directed contribution with rationale gets high priority
priority = "high" if rationale and len(rationale) > 20 else "medium"
intake_tier = "directed" if rationale and len(rationale) > 20 else "undirected"
url_section = ""
if url_content:
url_section = f"\n## Article Content (fetched)\n\n{url_content[:8000]}\n"
content = f"""---
type: source
source_type: telegram
title: "Telegram: @{username}{slug}"
author: "@{username}"
url: "{urls[0] if urls else ''}"
date: {date_str}
domain: internet-finance
format: conversation
status: unprocessed
priority: {priority}
intake_tier: {intake_tier}
rationale: "{rationale[:200]}"
proposed_by: "@{username}"
tags: [telegram, ownership-community]
---
## Conversation
**@{username}:**
{user_text}
**Rio (response):**
{rio_response}
{url_section}
## Agent Notes
**Why archived:** Tagged exchange in ownership community.
**Rationale from contributor:** {rationale if rationale else 'No rationale provided (bare link or question)'}
**Intake tier:** {intake_tier}{'fast-tracked, contributor provided reasoning' if intake_tier == 'directed' else 'standard processing'}
**Triage:** Conversation may contain [CLAIM], [ENTITY], or [EVIDENCE] for extraction.
"""
with main_worktree_lock(timeout=10):
archive_path.write_text(content)
logger.info("Archived exchange to %s (tier: %s, urls: %d)",
filename, intake_tier, len(urls or []))
_git_commit_archive(archive_path, filename)
except TimeoutError:
logger.warning("Failed to archive exchange: worktree lock timeout")
except Exception as e:
logger.error("Failed to archive exchange: %s", e)
# ─── Batch Triage ───────────────────────────────────────────────────────
async def run_batch_triage(context: ContextTypes.DEFAULT_TYPE):
"""Batch triage of buffered messages every TRIAGE_INTERVAL seconds.
Groups messages into conversation windows, sends to Haiku for classification,
archives substantive findings.
"""
global message_buffer
if not message_buffer:
return
# Grab and clear buffer
messages = message_buffer[:]
message_buffer = []
logger.info("Batch triage: %d messages to process", len(messages))
# Group into conversation windows (messages within 5 min of each other)
windows = _group_into_windows(messages, window_seconds=300)
if not windows:
return
# Build triage prompt
windows_text = ""
for i, window in enumerate(windows):
window_msgs = "\n".join(
f" @{m.get('username', '?')}: {m['text'][:200]}"
for m in window
)
windows_text += f"\n--- Window {i+1} ({len(window)} messages) ---\n{window_msgs}\n"
prompt = f"""Classify each conversation window. For each, respond with ONE tag:
[CLAIM] — Contains a specific, disagreeable proposition about how something works
[ENTITY] — Contains factual data about a company, protocol, person, or market
[EVIDENCE] — Contains data or argument that supports or challenges an existing claim about internet finance, futarchy, prediction markets, or token governance
[SKIP] — Casual conversation, not relevant to the knowledge base
Be generous with EVIDENCE — even confirming evidence strengthens the KB.
{windows_text}
Respond with ONLY the window numbers and tags, one per line:
1: [TAG]
2: [TAG]
..."""
result = await call_openrouter(TRIAGE_MODEL, prompt, max_tokens=500)
if not result:
logger.warning("Triage LLM call failed — buffered messages dropped")
return
# Parse triage results
for line in result.strip().split("\n"):
match = re.match(r"(\d+):\s*\[(\w+)\]", line)
if not match:
continue
idx = int(match.group(1)) - 1
tag = match.group(2).upper()
if idx < 0 or idx >= len(windows):
continue
if tag in ("CLAIM", "ENTITY", "EVIDENCE"):
_archive_window(windows[idx], tag)
logger.info("Triage complete: %d windows processed", len(windows))
def _group_into_windows(messages: list[dict], window_seconds: int = 300) -> list[list[dict]]:
"""Group messages into conversation windows by time proximity."""
if not messages:
return []
# Sort by timestamp
messages.sort(key=lambda m: m.get("timestamp", ""))
windows = []
current_window = [messages[0]]
for msg in messages[1:]:
# Simple grouping: if within window_seconds of previous message, same window
current_window.append(msg)
if len(current_window) >= 10: # Cap window size
windows.append(current_window)
current_window = []
if current_window:
windows.append(current_window)
return windows
def _archive_window(window: list[dict], tag: str):
"""Archive a triaged conversation window to inbox/queue/."""
try:
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
first_user = window[0].get("username", "group")
slug = re.sub(r"[^a-z0-9]+", "-", window[0]["text"][:40].lower()).strip("-")
filename = f"{date_str}-telegram-{first_user}-{slug}.md"
archive_path = Path(ARCHIVE_DIR) / "inbox" / "queue" / filename
archive_path.parent.mkdir(parents=True, exist_ok=True)
# Build conversation content
conversation = ""
contributors = set()
for msg in window:
username = msg.get("username", "anonymous")
contributors.add(username)
conversation += f"**@{username}:** {msg['text']}\n\n"
content = f"""---
type: source
source_type: telegram
title: "Telegram conversation: {slug}"
author: "{', '.join(contributors)}"
date: {date_str}
domain: internet-finance
format: conversation
status: unprocessed
priority: medium
triage_tag: {tag.lower()}
tags: [telegram, ownership-community]
---
## Conversation ({len(window)} messages, {len(contributors)} participants)
{conversation}
## Agent Notes
**Triage:** [{tag}] — classified by batch triage
**Participants:** {', '.join(f'@{u}' for u in contributors)}
"""
with main_worktree_lock(timeout=10):
archive_path.write_text(content)
logger.info("Archived window [%s]: %s (%d msgs, %d participants)",
tag, filename, len(window), len(contributors))
_git_commit_archive(archive_path, filename)
except TimeoutError:
logger.warning("Failed to archive window: worktree lock timeout")
except Exception as e:
logger.error("Failed to archive window: %s", e)
# ─── Bot Setup ──────────────────────────────────────────────────────────
async def start_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
"""Handle /start command."""
await update.message.reply_text(
"I'm Rio, the internet finance agent for TeleoHumanity's collective knowledge base. "
"Tag me with @teleo to ask about futarchy, prediction markets, token governance, "
"or anything in our domain. I'll ground my response in our KB's evidence."
)
async def stats_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
"""Handle /stats command — show KB stats."""
kb_index.ensure_fresh()
stats = get_db_stats()
await update.message.reply_text(
f"📊 KB Stats:\n"
f"{len(kb_index._claims)} claims indexed\n"
f"{len(kb_index._entities)} entities tracked\n"
f"{len(kb_index._positions)} agent positions\n"
f"{stats['merged_claims']} PRs merged\n"
f"{stats['contributors']} contributors"
)
def main():
"""Start the bot."""
# Load token
token_path = Path(BOT_TOKEN_FILE)
if not token_path.exists():
logger.error("Bot token not found at %s", BOT_TOKEN_FILE)
sys.exit(1)
token = token_path.read_text().strip()
logger.info("Starting Teleo Telegram bot (Rio)...")
# Build application
app = Application.builder().token(token).build()
# Command handlers
app.add_handler(CommandHandler("start", start_command))
app.add_handler(CommandHandler("stats", stats_command))
# Tag handler — messages mentioning the bot
# python-telegram-bot filters.Mention doesn't work for bot mentions in groups
# Use a regex filter for the bot username
app.add_handler(MessageHandler(
filters.TEXT & filters.Regex(r"(?i)(@teleo|@futairdbot)"),
handle_tagged,
))
# Reply handler — replies to the bot's own messages continue the conversation
reply_to_bot_filter = filters.TEXT & filters.REPLY & ~filters.COMMAND
app.add_handler(MessageHandler(
reply_to_bot_filter,
handle_reply_to_bot,
))
# All other text messages — buffer for triage
app.add_handler(MessageHandler(
filters.TEXT & ~filters.COMMAND,
handle_message,
))
# Batch triage job
app.job_queue.run_repeating(
run_batch_triage,
interval=TRIAGE_INTERVAL,
first=TRIAGE_INTERVAL,
)
# Run
logger.info("Bot running. Triage interval: %ds", TRIAGE_INTERVAL)
app.run_polling(drop_pending_updates=True)
if __name__ == "__main__":
main()