commit telegram bot module from VPS — 20 files never previously in repo
Pulled from /opt/teleo-eval/telegram/ on VPS. Includes: - bot.py (92K), kb_retrieval.py, kb_tools.py (agentic retrieval) - retrieval.py (RRF merge, query decomposition, entity traversal) - response.py (system prompt builder, response parser) - agent_config.py, agent_runner.py (multi-agent template unit support) - approval_stages.py, approvals.py, digest.py (approval workflow) - eval_checks.py, eval.py (response quality checks) - output_gate.py, x_publisher.py, x_client.py, x_search.py (X pipeline) - market_data.py, worktree_lock.py (utilities) - rio.yaml, theseus.yaml (agent configs) These files were deployed to VPS but never committed to the repo. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
7ba6247b9d
commit
7bfce6b706
20 changed files with 6668 additions and 0 deletions
160
ops/pipeline-v2/telegram/agent_config.py
Normal file
160
ops/pipeline-v2/telegram/agent_config.py
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Agent config loader and validator.
|
||||
|
||||
Loads YAML config files from telegram/agents/*.yaml, validates required fields,
|
||||
resolves file paths. Used by bot.py and future agent_runner.py.
|
||||
|
||||
Epimetheus owns this module.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger("tg.agent_config")
|
||||
|
||||
SECRETS_DIR = "/opt/teleo-eval/secrets"
|
||||
WORKTREE_DIR = "/opt/teleo-eval/workspaces/main"
|
||||
|
||||
REQUIRED_FIELDS = ["name", "handle", "bot_token_file", "pentagon_agent_id", "domain"]
|
||||
REQUIRED_VOICE_FIELDS = ["voice_summary", "voice_definition"]
|
||||
REQUIRED_KB_FIELDS = ["kb_scope"]
|
||||
|
||||
|
||||
@dataclass
class AgentConfig:
    """Validated agent configuration loaded from YAML."""
    name: str
    handle: str
    x_handle: Optional[str]
    bot_token_file: str
    pentagon_agent_id: str
    domain: str
    kb_scope_primary: list[str]
    voice_summary: str
    voice_definition: str
    domain_expertise: str
    learnings_file: str
    opsec_additional_patterns: list[str] = field(default_factory=list)
    response_model: str = "anthropic/claude-opus-4-6"
    triage_model: str = "anthropic/claude-haiku-4.5"
    max_tokens: int = 1024
    max_response_per_user_per_hour: int = 30

    def to_dict(self) -> dict:
        """Convert to dict for passing to build_system_prompt."""
        # Mirror only the prompt-relevant attributes, in a fixed order.
        keys = (
            "name",
            "handle",
            "x_handle",
            "domain",
            "voice_definition",
            "voice_summary",
            "domain_expertise",
            "pentagon_agent_id",
        )
        return {key: getattr(self, key) for key in keys}

    @property
    def bot_token_path(self) -> str:
        """Absolute path of the Telegram bot token file under SECRETS_DIR."""
        return os.path.join(SECRETS_DIR, self.bot_token_file)

    @property
    def learnings_path(self) -> str:
        """Absolute path of the learnings file under WORKTREE_DIR."""
        return os.path.join(WORKTREE_DIR, self.learnings_file)

    @property
    def handle_regex(self) -> re.Pattern:
        """Regex matching this agent's @handle with optional @botname suffix."""
        bare_handle = self.handle.lstrip("@")
        pattern = rf"@{re.escape(bare_handle)}(?:@\w+)?"
        return re.compile(pattern, re.IGNORECASE)
||||
|
||||
def load_agent_config(config_path: str) -> AgentConfig:
    """Load and validate an agent YAML config file.

    Args:
        config_path: Path to the agent's YAML file (telegram/agents/*.yaml).

    Returns:
        A fully-populated AgentConfig.

    Raises:
        ValueError: if the file is not a YAML mapping or required fields
            are missing/malformed.
    """
    import yaml

    with open(config_path) as f:
        raw = yaml.safe_load(f)

    # An empty or non-mapping YAML file would otherwise crash the field
    # checks below with a TypeError; fail with a clear message instead.
    if not isinstance(raw, dict):
        raise ValueError(f"Agent config is not a YAML mapping: {config_path}")

    errors = []

    # Required fields
    for fld in REQUIRED_FIELDS + REQUIRED_VOICE_FIELDS:
        if fld not in raw or not raw[fld]:
            errors.append(f"Missing required field: {fld}")

    # KB scope: must be a mapping with a non-empty 'primary' list
    kb_scope = raw.get("kb_scope", {})
    if not isinstance(kb_scope, dict) or "primary" not in kb_scope:
        errors.append("Missing kb_scope.primary (list of primary domain dirs)")
    elif not isinstance(kb_scope["primary"], list) or len(kb_scope["primary"]) == 0:
        errors.append("kb_scope.primary must be a non-empty list")

    # Learnings file
    if "learnings_file" not in raw:
        errors.append("Missing required field: learnings_file")

    if errors:
        raise ValueError(
            f"Agent config validation failed ({config_path}):\n"
            + "\n".join(f" - {e}" for e in errors)
        )

    return AgentConfig(
        name=raw["name"],
        handle=raw["handle"],
        x_handle=raw.get("x_handle"),
        bot_token_file=raw["bot_token_file"],
        pentagon_agent_id=raw["pentagon_agent_id"],
        domain=raw["domain"],
        kb_scope_primary=kb_scope["primary"],
        voice_summary=raw["voice_summary"],
        voice_definition=raw["voice_definition"],
        domain_expertise=raw.get("domain_expertise", ""),
        learnings_file=raw["learnings_file"],
        opsec_additional_patterns=raw.get("opsec_additional_patterns", []),
        response_model=raw.get("response_model", "anthropic/claude-opus-4-6"),
        triage_model=raw.get("triage_model", "anthropic/claude-haiku-4.5"),
        max_tokens=raw.get("max_tokens", 1024),
        max_response_per_user_per_hour=raw.get("max_response_per_user_per_hour", 30),
    )
||||
|
||||
def validate_agent_config(config_path: str) -> list[str]:
    """Validate config file and check runtime dependencies.

    Returns list of warnings (empty = all good).
    Raises ValueError on hard failures.
    """
    config = load_agent_config(config_path)
    warnings: list[str] = []

    # Bot token must exist on disk for the bot to start.
    if not os.path.exists(config.bot_token_path):
        warnings.append(f"Bot token file not found: {config.bot_token_path}")

    # Every primary KB scope entry should resolve to a directory.
    for scope_dir in config.kb_scope_primary:
        resolved = os.path.join(WORKTREE_DIR, scope_dir)
        if not os.path.isdir(resolved):
            warnings.append(f"KB scope dir not found: {resolved}")

    # The learnings file's parent directory must exist.
    parent = os.path.dirname(config.learnings_path)
    if not os.path.isdir(parent):
        warnings.append(f"Learnings dir not found: {parent}")

    # Every extra OPSEC pattern must be a compilable regex.
    for idx, pat in enumerate(config.opsec_additional_patterns):
        try:
            re.compile(pat, re.IGNORECASE)
        except re.error as exc:
            warnings.append(f"Invalid OPSEC regex pattern [{idx}]: {exc}")

    return warnings
||||
118
ops/pipeline-v2/telegram/agent_runner.py
Normal file
118
ops/pipeline-v2/telegram/agent_runner.py
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Agent runner — entry point for running a Teleo Telegram agent.
|
||||
|
||||
Usage:
|
||||
python3 agent_runner.py --agent rio
|
||||
python3 agent_runner.py --agent theseus
|
||||
python3 agent_runner.py --agent rio --validate
|
||||
|
||||
Systemd template unit: teleo-agent@.service
|
||||
ExecStart=/usr/bin/python3 /opt/teleo-eval/telegram/agent_runner.py --agent %i
|
||||
|
||||
Each agent runs as a separate process for fault isolation.
|
||||
Template unit means `systemctl start teleo-agent@rio` and
|
||||
`systemctl start teleo-agent@theseus` are independent services
|
||||
with separate log streams (journalctl -u teleo-agent@rio).
|
||||
|
||||
Epimetheus owns this module.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
AGENTS_DIR = Path(__file__).parent / "agents"
|
||||
|
||||
|
||||
def find_config(agent_name: str) -> Path:
    """Resolve agent name to config file path."""
    candidate = AGENTS_DIR / f"{agent_name}.yaml"
    if candidate.exists():
        return candidate
    # Unknown agent: report what is available, then abort.
    available = ", ".join(p.stem for p in AGENTS_DIR.glob("*.yaml"))
    print(f"ERROR: Config not found: {candidate}", file=sys.stderr)
    print(f"Available agents: {available}", file=sys.stderr)
    sys.exit(1)
||||
|
||||
def validate(agent_name: str) -> bool:
    """Validate agent config and runtime dependencies. Returns True if valid."""
    config_path = find_config(agent_name)
    # Make agent_config importable when this file is run as a script.
    sys.path.insert(0, str(Path(__file__).parent))
    from agent_config import validate_agent_config

    try:
        issues = validate_agent_config(str(config_path))
    except ValueError as exc:
        print(f" FAILED: {exc}", file=sys.stderr)
        return False

    for issue in issues:
        print(f" WARNING: {issue}", file=sys.stderr)
    print(f" Config OK: {agent_name} ({config_path})")
    return True
||||
|
||||
def run(agent_name: str):
    """Run the agent bot process."""
    config_path = find_config(agent_name)

    # Fail fast: refuse to start on an invalid config.
    if not validate(agent_name):
        sys.exit(1)

    # bot.main() reads its config path from sys.argv, so stage it here.
    sys.argv = ["bot.py", "--config", str(config_path)]

    # Import and run the bot — this call blocks until the bot exits.
    sys.path.insert(0, str(Path(__file__).parent))
    import bot
    bot.main()
||||
|
||||
def list_agents():
    """List available agent configs."""
    configs = sorted(AGENTS_DIR.glob("*.yaml"))
    if not configs:
        print("No agent configs found in", AGENTS_DIR)
        return
    print("Available agents:")
    for cfg_path in configs:
        stem = cfg_path.stem
        # Best-effort parse: show the domain if the YAML is readable.
        try:
            import yaml
            with open(cfg_path) as fh:
                parsed = yaml.safe_load(fh)
            print(f" {stem:12s} domain={parsed.get('domain', 'unknown')}")
        except Exception:
            print(f" {stem:12s} (config parse error)")
||||
|
||||
def main():
    """CLI entry point: --list, --validate, or run an agent."""
    parser = argparse.ArgumentParser(
        description="Run a Teleo Telegram agent",
        epilog="Systemd: teleo-agent@.service uses --agent %%i",
    )
    parser.add_argument("--agent", help="Agent name (e.g., rio, theseus)")
    parser.add_argument("--validate", action="store_true", help="Validate config and exit")
    parser.add_argument("--list", action="store_true", help="List available agents")
    args = parser.parse_args()

    if args.list:
        list_agents()
        return

    # --agent is mandatory for every mode except --list.
    if not args.agent:
        parser.error("--agent is required (or use --list)")

    if args.validate:
        sys.exit(0 if validate(args.agent) else 1)

    run(args.agent)
||||
|
||||
# Script entry point — also what the systemd template unit invokes.
if __name__ == "__main__":
    main()
||||
241
ops/pipeline-v2/telegram/approval_stages.py
Normal file
241
ops/pipeline-v2/telegram/approval_stages.py
Normal file
|
|
@ -0,0 +1,241 @@
|
|||
"""Pluggable approval architecture — extensible voting stages for content approval.
|
||||
|
||||
Design constraint from m3ta: the approval step must be a pipeline stage, not hardcoded.
|
||||
|
||||
Current stage: 1 human approves via Telegram.
|
||||
Future stages (interface designed, not implemented):
|
||||
- Agent pre-screening votes (weighted by CI score)
|
||||
- Multi-human approval
|
||||
- Domain-agent substance checks
|
||||
- Futarchy-style decision markets on high-stakes content
|
||||
|
||||
Adding a new approval stage = implementing ApprovalStage and registering it.
|
||||
Threshold logic aggregates votes across all stages.
|
||||
|
||||
Epimetheus owns this module.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Callable, Optional
|
||||
|
||||
logger = logging.getLogger("approval-stages")
|
||||
|
||||
|
||||
class Vote(Enum):
    """Outcome a single approval stage can cast for a request."""
    APPROVE = "approve"
    REJECT = "reject"
    ABSTAIN = "abstain"  # no opinion — e.g. async stages that decide later
||||
|
||||
@dataclass
class StageResult:
    """Result from a single approval stage."""
    stage_name: str  # ApprovalStage.name of the stage that voted
    vote: Vote
    weight: float  # 0.0 - 1.0, how much this stage's vote counts
    reason: str = ""  # human-readable explanation of the vote
    metadata: dict = field(default_factory=dict)  # stage-specific extras, e.g. {"async": True}
||||
|
||||
@dataclass
class AggregateResult:
    """Aggregated result across all approval stages."""
    approved: bool
    total_weight_approve: float
    total_weight_reject: float
    total_weight_abstain: float
    stage_results: list[StageResult]
    threshold: float  # what threshold was used

    @property
    def summary(self) -> str:
        """One-line human-readable outcome, e.g. 'APPROVED (approve=1.00, …)'."""
        verdict = "APPROVED" if self.approved else "REJECTED"
        details = ", ".join([
            f"approve={self.total_weight_approve:.2f}",
            f"reject={self.total_weight_reject:.2f}",
            f"threshold={self.threshold:.2f}",
        ])
        return f"{verdict} ({details})"
||||
|
||||
class ApprovalStage:
    """Base class for approval stages.

    Implement check() to add a new approval stage.
    The method receives the approval request and returns a StageResult.

    Stages run in priority order (lower = earlier).
    A stage can short-circuit by returning a REJECT with weight >= threshold.
    """

    name: str = "unnamed"  # identifier reported in StageResult.stage_name
    priority: int = 100  # lower = runs earlier
    weight: float = 1.0  # default weight of this stage's vote

    def check(self, request: dict) -> StageResult:
        """Evaluate the approval request. Must be overridden.

        The built-in stages read the "content" and "originating_agent"
        keys of *request*; subclasses may rely on additional keys.
        """
        raise NotImplementedError
|
||||
# ─── Built-in Stages ─────────────────────────────────────────────────
|
||||
|
||||
class OutputGateStage(ApprovalStage):
    """Stage 0: Deterministic output gate. Blocks system content."""

    name = "output_gate"
    priority = 0
    weight = 1.0  # absolute veto — if gate blocks, nothing passes

    def check(self, request: dict) -> StageResult:
        # Deferred import — presumably to avoid an import cycle at module
        # load; confirm against output_gate.py.
        from output_gate import gate_for_tweet_queue

        content = request.get("content", "")
        agent = request.get("originating_agent", "")
        gate = gate_for_tweet_queue(content, agent)

        # NOTE(review): the else branch reads gate.blocked_reasons on a
        # falsy `gate`. This assumes gate_for_tweet_queue returns a result
        # object whose truthiness means "passed" (not None/empty) — verify,
        # otherwise a blocked tweet raises AttributeError here.
        if gate:
            return StageResult(self.name, Vote.APPROVE, self.weight,
                               "Content passed output gate")
        else:
            return StageResult(self.name, Vote.REJECT, self.weight,
                               f"Blocked: {', '.join(gate.blocked_reasons)}",
                               {"blocked_reasons": gate.blocked_reasons})
||||
|
||||
class OpsecStage(ApprovalStage):
    """Stage 1: OPSEC content filter. Blocks sensitive content."""

    name = "opsec_filter"
    priority = 1
    weight = 1.0  # absolute veto

    def check(self, request: dict) -> StageResult:
        # Deferred import keeps module load free of the approvals dependency.
        from approvals import check_opsec

        violation = check_opsec(request.get("content", ""))
        if violation is None:
            return StageResult(self.name, Vote.APPROVE, self.weight,
                               "No OPSEC violations")
        return StageResult(self.name, Vote.REJECT, self.weight, violation)
||||
|
||||
class HumanApprovalStage(ApprovalStage):
    """Stage 10: Human approval via Telegram. Currently the final gate.

    This stage is async — it doesn't return immediately.
    Instead, it sets up the Telegram notification and returns ABSTAIN.
    The actual vote comes later when Cory taps Approve/Reject.
    """

    name = "human_approval"
    priority = 10
    weight = 1.0

    def check(self, request: dict) -> StageResult:
        # Only sanity-check the request shape here; the real decision
        # arrives later via the Telegram callback handler.
        if request.get("content"):
            return StageResult(self.name, Vote.ABSTAIN, self.weight,
                               "Awaiting human approval via Telegram",
                               {"async": True})
        return StageResult(self.name, Vote.REJECT, self.weight,
                           "No content to approve")
||||
|
||||
# ─── Stage Registry ──────────────────────────────────────────────────

# Default stages — these run for every approval request
_DEFAULT_STAGES: list[ApprovalStage] = [
    OutputGateStage(),
    OpsecStage(),
    HumanApprovalStage(),
]

# Custom stages added by agents or plugins
# (mutated by register_stage(), which keeps this list priority-sorted)
_CUSTOM_STAGES: list[ApprovalStage] = []
||||
|
||||
def register_stage(stage: ApprovalStage):
    """Add a plugin-provided approval stage, keeping the list priority-ordered."""
    _CUSTOM_STAGES.append(stage)
    _CUSTOM_STAGES.sort(key=lambda st: st.priority)
    logger.info("Registered approval stage: %s (priority=%d, weight=%.2f)",
                stage.name, stage.priority, stage.weight)
|
||||
|
||||
def get_all_stages() -> list[ApprovalStage]:
    """Return every registered stage (built-in + custom) sorted by priority."""
    # sorted() is stable, so equal-priority stages keep default-before-custom order.
    return sorted(_DEFAULT_STAGES + _CUSTOM_STAGES, key=lambda st: st.priority)
||||
|
||||
# ─── Aggregation ─────────────────────────────────────────────────────
|
||||
|
||||
def run_sync_stages(request: dict, threshold: float = 0.5) -> AggregateResult:
    """Run all synchronous approval stages and aggregate results.

    Stages with async=True in metadata are skipped (handled separately).
    Short-circuits on any REJECT with weight >= threshold.

    Args:
        request: dict with at minimum {content, originating_agent, type}
        threshold: weighted approve score needed to pass (0.0-1.0)

    Returns:
        AggregateResult with the decision.
    """
    stages = get_all_stages()
    results = []
    total_approve = 0.0
    total_reject = 0.0
    total_abstain = 0.0

    for stage in stages:
        try:
            result = stage.check(request)
        except Exception as e:
            # A crashing stage must not block the pipeline: count it as an
            # ABSTAIN with zero weight so it influences neither side.
            logger.error("Stage %s failed: %s — treating as ABSTAIN", stage.name, e)
            result = StageResult(stage.name, Vote.ABSTAIN, 0.0, f"Error: {e}")

        results.append(result)

        if result.vote == Vote.APPROVE:
            total_approve += result.weight
        elif result.vote == Vote.REJECT:
            total_reject += result.weight
            # Short-circuit: absolute veto — a single sufficiently-weighted
            # REJECT ends the run; later stages are never consulted.
            if result.weight >= threshold:
                return AggregateResult(
                    approved=False,
                    total_weight_approve=total_approve,
                    total_weight_reject=total_reject,
                    total_weight_abstain=total_abstain,
                    stage_results=results,
                    threshold=threshold,
                )
        else:
            # ABSTAIN (e.g. the async human stage) — tracked but excluded
            # from the approve/reject ratio below.
            total_abstain += result.weight

    # Final decision based on non-abstain votes
    active_weight = total_approve + total_reject
    if active_weight == 0:
        # All abstain — pass to async stages (human approval)
        approved = False  # not yet approved, awaiting human
    else:
        approved = (total_approve / active_weight) >= threshold

    return AggregateResult(
        approved=approved,
        total_weight_approve=total_approve,
        total_weight_reject=total_reject,
        total_weight_abstain=total_abstain,
        stage_results=results,
        threshold=threshold,
    )
|
||||
344
ops/pipeline-v2/telegram/approvals.py
Normal file
344
ops/pipeline-v2/telegram/approvals.py
Normal file
|
|
@ -0,0 +1,344 @@
|
|||
"""Telegram approval workflow — human-in-the-loop for outgoing comms + core KB changes.
|
||||
|
||||
Flow: Agent submits → Leo reviews substance → Bot sends to Cory → Cory approves/rejects.
|
||||
|
||||
Architecture:
|
||||
- approval_queue table in pipeline.db (migration v11)
|
||||
- Bot polls for leo_approved items, sends formatted Telegram messages with inline buttons
|
||||
- Cory taps Approve/Reject → callback handler updates status
|
||||
- 24h expiry timeout on all pending approvals
|
||||
|
||||
OPSEC: Content filter rejects submissions containing financial figures or deal-specific language.
|
||||
No deal terms, no dollar amounts, no private investment details in approval requests — ever.
|
||||
|
||||
Epimetheus owns this module.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import sqlite3
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from telegram import InlineKeyboardButton, InlineKeyboardMarkup, Update
|
||||
from telegram.ext import CallbackQueryHandler, ContextTypes
|
||||
|
||||
logger = logging.getLogger("telegram.approvals")
|
||||
|
||||
# ─── OPSEC Content Filter ─────────────────────────────────────────────
# Reject submissions containing financial figures or deal-specific language.
# Pattern matches: $1M, $500K, 1.5 million, deal terms, valuation, cap table, etc.
OPSEC_PATTERNS = [
    re.compile(r"\$[\d,.]+[KMBkmb]?\b", re.IGNORECASE),  # $500K, $1.5M, $100
    re.compile(r"\b\d+[\d,.]*\s*(million|billion|thousand)\b", re.IGNORECASE),
    re.compile(r"\b(deal terms?|valuation|cap table|equity split|ownership stake|term sheet|dilution|fee split)\b", re.IGNORECASE),
    re.compile(r"\b(SAFE\s+(?:note|round|agreement)|SAFT|convertible note|preferred stock|liquidation preference)\b", re.IGNORECASE),
    # NOTE(review): IGNORECASE makes this also match lowercase "series a"
    # in ordinary prose (e.g. "a series a fan made") — confirm intended.
    re.compile(r"\bSeries\s+[A-Z]\b", re.IGNORECASE),  # Series A/B/C/F funding rounds
    re.compile(r"\b(partnership terms|committed to (?:the |a )?round|funding round|(?:pre-?)?seed round)\b", re.IGNORECASE),
]

# Sensitive entity names — loaded from opsec-entities.txt config file.
# Edit the config file to add/remove entities without code changes.
_OPSEC_ENTITIES_FILE = Path(__file__).parent / "opsec-entities.txt"
|
||||
|
||||
def _load_sensitive_entities() -> list[re.Pattern]:
|
||||
"""Load sensitive entity patterns from config file."""
|
||||
patterns = []
|
||||
if _OPSEC_ENTITIES_FILE.exists():
|
||||
for line in _OPSEC_ENTITIES_FILE.read_text().splitlines():
|
||||
line = line.strip()
|
||||
if line and not line.startswith("#"):
|
||||
patterns.append(re.compile(rf"\b{line}\b", re.IGNORECASE))
|
||||
return patterns
|
||||
|
||||
|
||||
SENSITIVE_ENTITIES = _load_sensitive_entities()
|
||||
|
||||
|
||||
def check_opsec(content: str) -> str | None:
    """Check content against OPSEC patterns. Returns violation description or None."""
    # Financial-figure / deal-language patterns first.
    for pat in OPSEC_PATTERNS:
        if (hit := pat.search(content)) is not None:
            return f"OPSEC violation: content contains '{hit.group()}' — no financial figures or deal terms in approval requests"
    # Then the configurable sensitive-entity list.
    for pat in SENSITIVE_ENTITIES:
        if (hit := pat.search(content)) is not None:
            return f"OPSEC violation: content references sensitive entity '{hit.group()}' — deal-adjacent entities blocked"
    return None
|
||||
|
||||
# ─── Message Formatting ───────────────────────────────────────────────

# Human-friendly labels for approval_queue.type values; unknown types fall
# back to a title-cased version of the raw value (see format_approval_message).
TYPE_LABELS = {
    "tweet": "Tweet",
    "kb_change": "KB Change",
    "architecture_change": "Architecture Change",
    "public_post": "Public Post",
    "position": "Position",
    "agent_structure": "Agent Structure",
}

# ─── Tier Classification ─────────────────────────────────────────────
# Tier 1: Must approve (outgoing, public, irreversible)
# Tier 2: Should approve (core architecture, strategic)
# Tier 3: Autonomous (no approval needed — goes to daily digest only)

TIER_1_TYPES = {"tweet", "public_post", "position"}
TIER_2_TYPES = {"kb_change", "architecture_change", "agent_structure"}
# Everything else is Tier 3 — no approval queue entry, digest only
||||
|
||||
def classify_tier(approval_type: str) -> int:
    """Classify an approval request into tier 1, 2, or 3."""
    if approval_type in TIER_1_TYPES:
        return 1
    # Anything not explicitly tier 1 or 2 is autonomous (tier 3).
    return 2 if approval_type in TIER_2_TYPES else 3
|
||||
|
||||
def format_approval_message(row: sqlite3.Row) -> str:
    """Format an approval request for Telegram display."""
    type_label = TYPE_LABELS.get(row["type"], row["type"].replace("_", " ").title())
    content = row["content"]

    # Telegram caps messages at 4096 chars; truncate body with headroom
    # for the header lines.
    if len(content) > 3000:
        content = content[:3000] + "\n\n[... truncated]"

    lines = [
        "APPROVAL REQUEST",
        "",
        f"Type: {type_label}",
        f"From: {row['originating_agent'].title()}",
    ]

    if row["context"]:
        lines.append(f"Context: {row['context']}")
    if row["leo_review_note"]:
        lines.append(f"Leo review: {row['leo_review_note']}")

    lines += ["", "---", content, "---"]
    return "\n".join(lines)
|
||||
|
||||
def build_keyboard(request_id: int) -> InlineKeyboardMarkup:
    """Build inline keyboard with Approve/Reject buttons."""
    # Callback data encodes action + queue row id; parsed by
    # handle_approval_callback as "<action>:<id>".
    approve_btn = InlineKeyboardButton("Approve", callback_data=f"approve:{request_id}")
    reject_btn = InlineKeyboardButton("Reject", callback_data=f"reject:{request_id}")
    return InlineKeyboardMarkup([[approve_btn, reject_btn]])
|
||||
|
||||
# ─── Core Logic ───────────────────────────────────────────────────────
|
||||
|
||||
def get_pending_for_cory(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Get approval requests that Leo approved and are ready for Cory.

    "Ready" means: Leo signed off, still pending, not yet sent to Telegram
    (telegram_message_id IS NULL — set by record_telegram_message after the
    notification goes out), and not past expiry. Oldest submissions first.
    """
    return conn.execute(
        """SELECT * FROM approval_queue
        WHERE leo_review_status = 'leo_approved'
        AND status = 'pending'
        AND telegram_message_id IS NULL
        AND (expires_at IS NULL OR expires_at > datetime('now'))
        ORDER BY submitted_at ASC""",
    ).fetchall()
|
||||
|
||||
def expire_stale_requests(conn: sqlite3.Connection) -> int:
    """Expire requests older than 24h. Returns count expired."""
    expired = conn.execute(
        """UPDATE approval_queue
        SET status = 'expired', decided_at = datetime('now')
        WHERE status = 'pending'
        AND expires_at IS NOT NULL
        AND expires_at <= datetime('now')""",
    ).rowcount
    # Only commit (and log) when something actually changed.
    if expired > 0:
        conn.commit()
        logger.info("Expired %d stale approval requests", expired)
    return expired
|
||||
|
||||
def record_decision(
|
||||
conn: sqlite3.Connection,
|
||||
request_id: int,
|
||||
decision: str,
|
||||
decision_by: str,
|
||||
rejection_reason: str = None,
|
||||
) -> bool:
|
||||
"""Record an approval/rejection decision. Returns True if updated."""
|
||||
cursor = conn.execute(
|
||||
"""UPDATE approval_queue
|
||||
SET status = ?, decision_by = ?, rejection_reason = ?,
|
||||
decided_at = datetime('now')
|
||||
WHERE id = ? AND status = 'pending'""",
|
||||
(decision, decision_by, rejection_reason, request_id),
|
||||
)
|
||||
conn.commit()
|
||||
return cursor.rowcount > 0
|
||||
|
||||
|
||||
def record_telegram_message(conn: sqlite3.Connection, request_id: int, message_id: int):
    """Record the Telegram message ID for an approval notification."""
    # Marks the row as "sent" so the poll loop won't re-notify it.
    params = (message_id, request_id)
    conn.execute(
        "UPDATE approval_queue SET telegram_message_id = ? WHERE id = ?",
        params,
    )
    conn.commit()
|
||||
|
||||
# ─── Telegram Handlers ────────────────────────────────────────────────
|
||||
|
||||
async def handle_approval_callback(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handle Approve/Reject button taps from Cory.

    Callback data is "approve:<id>" or "reject:<id>" (see build_keyboard).
    Rejections are recorded immediately; the user may then reply to the
    edited message with optional feedback (see handle_rejection_reply).
    Approved 'tweet' rows trigger an immediate auto-post to X.
    """
    query = update.callback_query
    # Ack the tap so Telegram stops showing the loading spinner.
    await query.answer()

    data = query.data
    # Malformed or foreign callback payloads are silently ignored.
    if not data or ":" not in data:
        return

    action, request_id_str = data.split(":", 1)
    if action not in ("approve", "reject"):
        return

    try:
        request_id = int(request_id_str)
    except ValueError:
        return

    conn = context.bot_data.get("approval_conn")
    if not conn:
        await query.edit_message_text("Error: approval DB not connected")
        return

    if action == "reject":
        # Check if user sent a reply with rejection reason
        rejection_reason = None
        # For rejection, edit the message to ask for reason
        row = conn.execute(
            "SELECT * FROM approval_queue WHERE id = ?", (request_id,)
        ).fetchone()
        if not row or row["status"] != "pending":
            await query.edit_message_text("This request has already been processed.")
            return

        # Store pending rejection — user can reply with reason
        # NOTE(review): handle_rejection_reply matches replies via
        # telegram_message_id, not this flag — confirm the flag is still
        # consumed anywhere (it is never deleted here).
        context.bot_data[f"pending_reject:{request_id}"] = True
        await query.edit_message_text(
            f"{query.message.text}\n\nRejected. Reply to this message with feedback for the agent (optional).",
        )
        record_decision(conn, request_id, "rejected", query.from_user.username or str(query.from_user.id))
        logger.info("Approval #%d REJECTED by %s", request_id, query.from_user.username)
        return

    # Approve
    user = query.from_user.username or str(query.from_user.id)
    # record_decision only touches rows still 'pending', so a stale tap
    # returns False and falls through to the "already processed" branch.
    success = record_decision(conn, request_id, "approved", user)

    if success:
        # Check if this is a tweet — if so, auto-post to X
        row = conn.execute(
            "SELECT type FROM approval_queue WHERE id = ?", (request_id,)
        ).fetchone()

        post_status = ""
        if row and row["type"] == "tweet":
            try:
                from x_publisher import handle_approved_tweet
                result = await handle_approved_tweet(conn, request_id)
                if result.get("success"):
                    url = result.get("tweet_url", "")
                    post_status = f"\n\nPosted to X: {url}"
                    logger.info("Tweet #%d auto-posted: %s", request_id, url)
                else:
                    error = result.get("error", "unknown error")
                    post_status = f"\n\nPost failed: {error}"
                    logger.error("Tweet #%d auto-post failed: %s", request_id, error)
            except Exception as e:
                # Posting failures must not lose the approval — surface the
                # error inline in the edited Telegram message instead.
                post_status = f"\n\nPost failed: {e}"
                logger.error("Tweet #%d auto-post error: %s", request_id, e)

        await query.edit_message_text(
            f"{query.message.text}\n\nAPPROVED by {user}{post_status}"
        )
        logger.info("Approval #%d APPROVED by %s", request_id, user)
    else:
        await query.edit_message_text("This request has already been processed.")
|
||||
|
||||
async def handle_rejection_reply(update: Update, context: ContextTypes.DEFAULT_TYPE) -> bool:
    """Capture rejection reason from reply to a rejected approval message.

    Returns True when the reply was consumed as rejection feedback, False
    otherwise — so callers can fall through to other message handling.
    """
    if not update.message or not update.message.reply_to_message:
        return False

    # Check if the replied-to message is a rejected approval
    conn = context.bot_data.get("approval_conn")
    if not conn:
        return False

    # Match the reply to an approval row via the stored Telegram message id
    # (set by record_telegram_message when the notification was sent).
    reply_msg_id = update.message.reply_to_message.message_id
    row = conn.execute(
        "SELECT id FROM approval_queue WHERE telegram_message_id = ? AND status = 'rejected'",
        (reply_msg_id,),
    ).fetchone()

    if not row:
        return False

    # Update rejection reason
    reason = update.message.text.strip()
    conn.execute(
        "UPDATE approval_queue SET rejection_reason = ? WHERE id = ?",
        (reason, row["id"]),
    )
    conn.commit()
    await update.message.reply_text(f"Feedback recorded for approval #{row['id']}.")
    logger.info("Rejection reason added for approval #%d: %s", row["id"], reason[:100])
    return True
|
||||
|
||||
# ─── Poll Job ─────────────────────────────────────────────────────────
|
||||
|
||||
async def poll_approvals(context: ContextTypes.DEFAULT_TYPE):
    """Poll for Leo-approved requests and send to Cory. Runs every 30s.

    Scheduled job: expires stale requests, then sends one Telegram message
    (with inline keyboard) per newly pending approval, and records the sent
    message id so button callbacks / replies can be matched back to the row.
    """
    conn = context.bot_data.get("approval_conn")
    admin_chat_id = context.bot_data.get("admin_chat_id")

    # Both are wired in at startup; without either there is nothing to do.
    if not conn or not admin_chat_id:
        return

    # Expire stale requests first (may fail on DB lock - retry next cycle)
    try:
        expire_stale_requests(conn)
    except Exception:
        pass  # non-fatal, retries in 30s

    # Send new notifications
    pending = get_pending_for_cory(conn)
    for row in pending:
        try:
            text = format_approval_message(row)
            keyboard = build_keyboard(row["id"])
            msg = await context.bot.send_message(
                chat_id=admin_chat_id,
                text=text,
                reply_markup=keyboard,
            )
            # Persist the Telegram message id AFTER a successful send, so a
            # failed send leaves the row pending for the next poll cycle.
            record_telegram_message(conn, row["id"], msg.message_id)
            logger.info("Sent approval #%d to admin (type=%s, agent=%s)",
                        row["id"], row["type"], row["originating_agent"])
        except Exception as e:
            # Per-row isolation: one failed send must not block the rest.
            logger.error("Failed to send approval #%d: %s", row["id"], e)
|
||||
2069
ops/pipeline-v2/telegram/bot.py
Normal file
2069
ops/pipeline-v2/telegram/bot.py
Normal file
File diff suppressed because it is too large
Load diff
208
ops/pipeline-v2/telegram/digest.py
Normal file
208
ops/pipeline-v2/telegram/digest.py
Normal file
|
|
@ -0,0 +1,208 @@
|
|||
"""Daily digest — sends Cory a summary of all Tier 3 activity at 8am London time.
|
||||
|
||||
Aggregates: merged claims (with insight summaries), pipeline metrics, agent activity,
|
||||
pending review items. Runs as a scheduled job in bot.py.
|
||||
|
||||
Epimetheus owns this module.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
logger = logging.getLogger("telegram.digest")
|
||||
|
||||
LONDON_TZ = ZoneInfo("Europe/London")
DIGEST_HOUR_LONDON = 8  # 8am London time (auto-adjusts for BST/GMT)


def next_digest_time() -> datetime:
    """Return the next 8am Europe/London wall-clock time as a UTC datetime.

    All arithmetic is done in London local time so zoneinfo handles the
    BST/GMT transition; only the final result is converted to UTC.
    """
    london_now = datetime.now(LONDON_TZ)
    todays_run = london_now.replace(
        hour=DIGEST_HOUR_LONDON, minute=0, second=0, microsecond=0
    )
    # If today's 8am has already passed, the next run is tomorrow's.
    upcoming = todays_run if todays_run > london_now else todays_run + timedelta(days=1)
    return upcoming.astimezone(timezone.utc)
|
||||
|
||||
|
||||
def _get_merged_claims_24h(conn: sqlite3.Connection) -> list[dict]:
|
||||
"""Get PRs merged in the last 24 hours with domain and branch info."""
|
||||
rows = conn.execute(
|
||||
"""SELECT number, branch, domain, agent, commit_type, merged_at, description
|
||||
FROM prs
|
||||
WHERE merged_at > datetime('now', '-24 hours')
|
||||
AND status = 'merged'
|
||||
ORDER BY merged_at DESC""",
|
||||
).fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
|
||||
def _get_pipeline_metrics_24h(conn: sqlite3.Connection) -> dict:
|
||||
"""Get pipeline activity metrics for the last 24 hours."""
|
||||
total_merged = conn.execute(
|
||||
"SELECT COUNT(*) FROM prs WHERE merged_at > datetime('now', '-24 hours') AND status = 'merged'"
|
||||
).fetchone()[0]
|
||||
|
||||
total_closed = conn.execute(
|
||||
"SELECT COUNT(*) FROM prs WHERE status = 'closed' AND created_at > datetime('now', '-24 hours')"
|
||||
).fetchone()[0]
|
||||
|
||||
total_conflict = conn.execute(
|
||||
"SELECT COUNT(*) FROM prs WHERE status IN ('conflict', 'conflict_permanent') AND created_at > datetime('now', '-24 hours')"
|
||||
).fetchone()[0]
|
||||
|
||||
total_open = conn.execute(
|
||||
"SELECT COUNT(*) FROM prs WHERE status IN ('open', 'reviewing', 'approved', 'merging')"
|
||||
).fetchone()[0]
|
||||
|
||||
# Approval rate (last 24h)
|
||||
evaluated = conn.execute(
|
||||
"SELECT COUNT(*) FROM prs WHERE leo_verdict IN ('approve', 'request_changes') AND created_at > datetime('now', '-24 hours')"
|
||||
).fetchone()[0]
|
||||
approved = conn.execute(
|
||||
"SELECT COUNT(*) FROM prs WHERE leo_verdict = 'approve' AND created_at > datetime('now', '-24 hours')"
|
||||
).fetchone()[0]
|
||||
approval_rate = (approved / evaluated * 100) if evaluated > 0 else 0
|
||||
|
||||
return {
|
||||
"merged": total_merged,
|
||||
"closed": total_closed,
|
||||
"conflict": total_conflict,
|
||||
"open": total_open,
|
||||
"evaluated": evaluated,
|
||||
"approved": approved,
|
||||
"approval_rate": approval_rate,
|
||||
}
|
||||
|
||||
|
||||
def _get_agent_activity_24h(conn: sqlite3.Connection) -> dict[str, int]:
|
||||
"""Get PR count by agent for the last 24 hours."""
|
||||
rows = conn.execute(
|
||||
"""SELECT agent, COUNT(*) as cnt
|
||||
FROM prs
|
||||
WHERE created_at > datetime('now', '-24 hours')
|
||||
AND agent IS NOT NULL
|
||||
GROUP BY agent
|
||||
ORDER BY cnt DESC""",
|
||||
).fetchall()
|
||||
return {r["agent"]: r["cnt"] for r in rows}
|
||||
|
||||
|
||||
def _get_pending_review_count(conn: sqlite3.Connection) -> int:
|
||||
"""Count PRs awaiting review."""
|
||||
return conn.execute(
|
||||
"SELECT COUNT(*) FROM prs WHERE status IN ('open', 'reviewing')"
|
||||
).fetchone()[0]
|
||||
|
||||
|
||||
def _extract_claim_title(branch: str) -> str:
|
||||
"""Extract a human-readable claim title from a branch name.
|
||||
|
||||
Branch format: extract/source-slug or agent/description
|
||||
"""
|
||||
# Strip prefix (extract/, research/, theseus/, etc.)
|
||||
parts = branch.split("/", 1)
|
||||
slug = parts[1] if len(parts) > 1 else parts[0]
|
||||
# Convert slug to readable title
|
||||
return slug.replace("-", " ").replace("_", " ").title()
|
||||
|
||||
|
||||
|
||||
def format_digest(
    merged_claims: list[dict],
    metrics: dict,
    agent_activity: dict[str, int],
    pending_review: int,
) -> str:
    """Format the daily digest message.

    Args:
        merged_claims: rows from _get_merged_claims_24h (keys: branch, domain,
            description, commit_type, ...). Values may be None (SQL NULLs).
        metrics: dict from _get_pipeline_metrics_24h.
        agent_activity: agent name -> PR count (busiest first).
        pending_review: count of PRs awaiting review.

    Returns:
        Plain-text digest suitable for a single Telegram message.
    """
    now = datetime.now(timezone.utc)
    date_str = now.strftime("%Y-%m-%d")

    parts = [f"DAILY DIGEST — {date_str}", ""]

    # Merged claims section — grouped by domain for readability
    if merged_claims:
        by_domain: dict[str, list] = {}
        for claim in merged_claims:
            domain = claim.get("domain") or "unknown"
            by_domain.setdefault(domain, []).append(claim)

        parts.append(f"CLAIMS MERGED ({len(merged_claims)})")
        for domain, claims in sorted(by_domain.items()):
            for c in claims:
                # Use real description from frontmatter if available, fall back to slug title
                desc = c.get("description")
                if desc:
                    # Take first description if multiple (pipe-delimited)
                    display = desc.split(" | ")[0]
                    if len(display) > 120:
                        display = display[:117] + "..."
                else:
                    display = _extract_claim_title(c.get("branch", "unknown"))
                # BUG FIX: a DB NULL commit_type used to render as "[None] "
                # because .get(key, "") returns None when the key exists with
                # a NULL value; `or ""` collapses missing and NULL alike.
                commit_type = c.get("commit_type") or ""
                type_tag = f"[{commit_type}] " if commit_type else ""
                parts.append(f"  {type_tag}{display} ({domain})")
        parts.append("")
    else:
        parts.extend(["CLAIMS MERGED (0)", "  No claims merged in the last 24h", ""])

    # Pipeline metrics — success rate over everything that reached a terminal state
    success_rate = 0.0
    total_attempted = metrics["merged"] + metrics["closed"] + metrics["conflict"]
    if total_attempted > 0:
        success_rate = metrics["merged"] / total_attempted * 100

    parts.append("PIPELINE")
    parts.append(f"  Merged: {metrics['merged']} | Closed: {metrics['closed']} | Conflicts: {metrics['conflict']}")
    parts.append(f"  Success rate: {success_rate:.0f}% | Approval rate: {metrics['approval_rate']:.0f}%")
    parts.append(f"  Open PRs: {metrics['open']}")
    parts.append("")

    # Agent activity
    if agent_activity:
        parts.append("AGENTS")
        for agent, count in agent_activity.items():
            parts.append(f"  {agent}: {count} PRs")
        parts.append("")
    else:
        parts.extend(["AGENTS", "  No agent activity in the last 24h", ""])

    # Pending review (the f-string covers the zero case identically, so the
    # old if/else duplication is collapsed)
    parts.append(f"PENDING YOUR REVIEW: {pending_review}")

    return "\n".join(parts)
|
||||
|
||||
|
||||
async def send_daily_digest(context):
    """Scheduled job: build and send the daily digest to the admin chat.

    No-ops when the DB connection or admin chat id is missing; any failure
    is logged rather than raised so the job queue keeps running.
    """
    conn = context.bot_data.get("approval_conn")
    admin_chat_id = context.bot_data.get("admin_chat_id")

    if not conn or not admin_chat_id:
        logger.debug("Digest skipped — no DB connection or admin chat ID")
        return

    try:
        merged = _get_merged_claims_24h(conn)
        metrics = _get_pipeline_metrics_24h(conn)
        activity = _get_agent_activity_24h(conn)
        pending = _get_pending_review_count(conn)

        await context.bot.send_message(
            chat_id=admin_chat_id,
            text=format_digest(merged, metrics, activity, pending),
        )
        logger.info("Daily digest sent (%d claims, %d agents active)",
                    len(merged), len(activity))
    except Exception as e:
        logger.error("Failed to send daily digest: %s", e)
|
||||
52
ops/pipeline-v2/telegram/eval.py
Normal file
52
ops/pipeline-v2/telegram/eval.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
"""Eval pipeline stub — provides imports for bot.py.
|
||||
Full implementation pending Ganymede review."""
|
||||
|
||||
# Responses scoring below this confidence get a skepticism caveat prepended
# (see apply_confidence_floor).
# NOTE(review): eval_checks.py uses 0.4 for the same constant — confirm
# which value is canonical.
CONFIDENCE_FLOOR = 0.3
# Per-response USD cost above which an alert should fire
COST_ALERT_THRESHOLD = 0.22
|
||||
|
||||
|
||||
class _LLMResponse(str):
|
||||
"""str subclass carrying token counts and cost."""
|
||||
def __new__(cls, content, prompt_tokens=0, completion_tokens=0, cost=0.0, model=''):
|
||||
obj = super().__new__(cls, content)
|
||||
obj.prompt_tokens = prompt_tokens
|
||||
obj.completion_tokens = completion_tokens
|
||||
obj.cost = cost
|
||||
obj.model = model
|
||||
return obj
|
||||
|
||||
|
||||
def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """Per-model cost estimation in USD (rates are $/M tokens).

    Matches by substring against known model ids; unknown models fall back
    to Sonnet-class pricing (3.0 in / 15.0 out).
    """
    rates = {
        'anthropic/claude-opus-4': (15.0, 75.0),
        'anthropic/claude-sonnet-4': (3.0, 15.0),
        'anthropic/claude-haiku-4.5': (0.80, 4.0),
        'openai/gpt-4o': (2.50, 10.0),
    }
    for prefix, (input_rate, output_rate) in rates.items():
        if prefix in model:
            break
    else:
        # No known id matched — default rates.
        input_rate, output_rate = 3.0, 15.0
    return (prompt_tokens * input_rate + completion_tokens * output_rate) / 1_000_000
|
||||
|
||||
|
||||
def check_url_fabrication(response: str, kb_context: str) -> tuple[str, list[str]]:
    """Check for fabricated URLs. Returns (cleaned_response, fabricated_urls).

    A URL is "fabricated" when it appears in the response but not in the KB
    context; t.me links are exempt. With no KB context nothing is checked.
    """
    import re
    pattern = r'https?://[^\s\)"]+'
    found = re.findall(pattern, response)
    if not found or not kb_context:
        return response, []
    known = set(re.findall(pattern, kb_context))
    fabricated = [
        url for url in found
        if url not in known and not url.startswith('https://t.me/')
    ]
    cleaned = response
    for url in fabricated:
        cleaned = cleaned.replace(url, '[URL removed]')
    return cleaned, fabricated
|
||||
|
||||
|
||||
def apply_confidence_floor(response: str, confidence: float | None) -> tuple[str, bool, str | None]:
    """Apply confidence floor. Returns (response, blocked, block_reason).

    A None confidence means "no score available" and passes through untouched.
    """
    if confidence is None or confidence >= CONFIDENCE_FLOOR:
        return response, False, None
    caveat = '⚠️ Low confidence response — treat with skepticism.\n\n'
    reason = f'confidence {confidence:.2f} below floor {CONFIDENCE_FLOOR}'
    return caveat + response, True, reason
|
||||
76
ops/pipeline-v2/telegram/eval_checks.py
Normal file
76
ops/pipeline-v2/telegram/eval_checks.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
"""Eval pipeline — pure functions for response quality checks.
|
||||
|
||||
Extracted from bot.py so tests can import without telegram dependency.
|
||||
No side effects, no I/O, no imports beyond stdlib.
|
||||
|
||||
Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Per-model pricing (input $/M tokens, output $/M tokens) — from OpenRouter
MODEL_PRICING = {
    "anthropic/claude-opus-4-6": (15.0, 75.0),
    "anthropic/claude-sonnet-4-6": (3.0, 15.0),
    "anthropic/claude-haiku-4.5": (0.80, 4.0),
    "anthropic/claude-3.5-haiku": (0.80, 4.0),
    "openai/gpt-4o": (2.50, 10.0),
    "openai/gpt-4o-mini": (0.15, 0.60),
}

# Responses scoring below this get a caveat prepended by apply_confidence_floor.
# NOTE(review): eval.py uses 0.3 for the same constant — confirm which is canonical.
CONFIDENCE_FLOOR = 0.4
COST_ALERT_THRESHOLD = 0.22  # per-response alert threshold in USD

# URL fabrication regex — matches http:// and https:// URLs, stopping at
# whitespace and common delimiters ( ) ] " ' < >
_URL_RE = re.compile(r'https?://[^\s\)\]\"\'<>]+')
|
||||
|
||||
|
||||
class _LLMResponse(str):
|
||||
"""String subclass carrying token counts and cost from OpenRouter usage field."""
|
||||
prompt_tokens: int = 0
|
||||
completion_tokens: int = 0
|
||||
cost: float = 0.0
|
||||
model: str = ""
|
||||
|
||||
def __new__(cls, text: str, prompt_tokens: int = 0, completion_tokens: int = 0,
|
||||
cost: float = 0.0, model: str = ""):
|
||||
obj = super().__new__(cls, text)
|
||||
obj.prompt_tokens = prompt_tokens
|
||||
obj.completion_tokens = completion_tokens
|
||||
obj.cost = cost
|
||||
obj.model = model
|
||||
return obj
|
||||
|
||||
|
||||
def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """Estimate cost in USD from token counts and model pricing.

    Unknown model ids fall back to Sonnet-class rates (3.0 in / 15.0 out).
    """
    input_rate, output_rate = MODEL_PRICING.get(model, (3.0, 15.0))
    dollars_per_million = prompt_tokens * input_rate + completion_tokens * output_rate
    return dollars_per_million / 1_000_000
|
||||
|
||||
|
||||
def check_url_fabrication(response_text: str, kb_context: str) -> tuple[str, list[str]]:
    """Check for fabricated URLs in response. Replace any not found in KB context.

    Returns (cleaned_text, list_of_fabricated_urls).
    """
    known_urls = set(_URL_RE.findall(kb_context)) if kb_context else set()
    fabricated = [u for u in _URL_RE.findall(response_text) if u not in known_urls]
    cleaned = response_text
    for u in fabricated:
        cleaned = cleaned.replace(u, "[URL removed — not verified]")
    return cleaned, fabricated
|
||||
|
||||
|
||||
def apply_confidence_floor(display_response: str, confidence_score: float | None) -> tuple[str, bool, str | None]:
    """Apply confidence floor check.

    Returns (possibly_modified_response, is_blocked, block_reason).
    A None score means "no confidence available" and passes through.
    """
    if confidence_score is None or confidence_score >= CONFIDENCE_FLOOR:
        return display_response, False, None
    warned = (
        f"⚠️ Low confidence — I may not have reliable data on this topic.\n\n"
        + display_response
    )
    return warned, True, f"confidence {confidence_score:.2f} < floor {CONFIDENCE_FLOOR}"
|
||||
747
ops/pipeline-v2/telegram/kb_retrieval.py
Normal file
747
ops/pipeline-v2/telegram/kb_retrieval.py
Normal file
|
|
@ -0,0 +1,747 @@
|
|||
#!/usr/bin/env python3
|
||||
"""KB Retrieval for Telegram bot — multi-layer search across the Teleo knowledge base.
|
||||
|
||||
Architecture (Ganymede-reviewed):
|
||||
Layer 1: Entity resolution — query tokens → entity name/aliases/tags → entity file
|
||||
Layer 2: Claim search — substring + keyword matching on titles AND descriptions
|
||||
Layer 3: Agent context — positions, beliefs referencing matched entities/claims
|
||||
|
||||
Entry point: retrieve_context(query, repo_dir) → KBContext
|
||||
|
||||
Epimetheus owns this module.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
logger = logging.getLogger("kb-retrieval")
|
||||
|
||||
# ─── Types ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@dataclass
class EntityMatch:
    """A matched entity with its profile, as surfaced to the LLM prompt."""
    name: str  # display name from frontmatter (falls back to file stem)
    path: str  # path to the entity markdown file
    entity_type: str  # frontmatter entity_type; "unknown" when absent
    domain: str  # frontmatter domain; "unknown" when absent
    overview: str  # first ~500 chars of body (decisions: up to 8000)
    tags: list[str]  # frontmatter tags
    related_claims: list[str]  # wiki-link titles ([[...]]) extracted from body
|
||||
|
||||
|
||||
@dataclass
class ClaimMatch:
    """A matched claim surfaced by the claim-search layer."""
    title: str  # derived from the file stem with dashes → spaces
    path: str  # path to the claim markdown file
    domain: str  # frontmatter domain, or inferred from the path
    confidence: str  # frontmatter confidence; "unknown" when absent
    description: str  # frontmatter description (may be empty)
    score: float  # relevance score from _score_claim
|
||||
|
||||
|
||||
@dataclass
class PositionMatch:
    """An agent position on a topic, matched via term overlap."""
    agent: str  # owning agent (directory name under agents/)
    title: str  # position title from frontmatter or file stem
    content: str  # first ~500 chars of the position body
|
||||
|
||||
|
||||
@dataclass
class KBContext:
    """Full KB context for a query — passed to the LLM prompt."""
    entities: list[EntityMatch] = field(default_factory=list)  # layer 1 results
    claims: list[ClaimMatch] = field(default_factory=list)  # layer 2 results
    positions: list[PositionMatch] = field(default_factory=list)  # layer 3 results
    belief_excerpts: list[str] = field(default_factory=list)  # "**agent**: excerpt" strings
    stats: dict = field(default_factory=dict)  # index sizes + match counts (see retrieve_context)
|
||||
|
||||
|
||||
# ─── Index ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class KBIndex:
    """In-memory index of entities, claims, and agent state. Rebuilt on mtime change.

    NOTE(review): despite the docstring, rebuilds are purely time-based
    (ensure_fresh compares wall-clock age, not file mtimes) — confirm intent.
    """

    def __init__(self, repo_dir: str):
        self.repo_dir = Path(repo_dir)
        self._entities: list[dict] = []  # [{name, path, type, domain, tags, handles, body_excerpt, aliases}]
        self._claims: list[dict] = []  # [{title, path, domain, confidence, description}]
        self._positions: list[dict] = []  # [{agent, title, path, content}]
        self._beliefs: list[dict] = []  # [{agent, path, content}]
        self._entity_alias_map: dict[str, list[int]] = {}  # lowercase alias → indices into _entities
        self._last_build: float = 0  # epoch seconds of the last _build()

    def ensure_fresh(self, max_age_seconds: int = 300):
        """Rebuild index if stale. Rebuilds every max_age_seconds (default 5 min)."""
        now = time.time()
        if now - self._last_build > max_age_seconds:
            self._build()

    def _build(self):
        """Rebuild all indexes from filesystem."""
        logger.info("Rebuilding KB index from %s", self.repo_dir)
        start = time.time()

        # Reset everything so a rebuild never duplicates entries.
        self._entities = []
        self._claims = []
        self._positions = []
        self._beliefs = []
        self._entity_alias_map = {}

        self._index_entities()
        self._index_claims()
        self._index_agent_state()
        self._last_build = time.time()

        logger.info("KB index built in %.1fs: %d entities, %d claims, %d positions",
                    time.time() - start, len(self._entities), len(self._claims), len(self._positions))

    def _index_entities(self):
        """Scan entities/ and decisions/ for entity and decision files."""
        entity_dirs = [
            self.repo_dir / "entities",
            self.repo_dir / "decisions",
        ]
        for entities_dir in entity_dirs:
            if not entities_dir.exists():
                continue
            for md_file in entities_dir.rglob("*.md"):
                self._index_single_entity(md_file)

    def _index_single_entity(self, md_file: Path):
        """Index a single entity or decision file.

        Builds a rich alias set (name, slug, handles, tags, proposer,
        parent entity, mined tickers) so many query spellings resolve to
        the same entity. Failures are logged and skipped, never raised.
        """
        try:
            fm, body = _parse_frontmatter(md_file)
            # Only files explicitly typed entity/decision are indexed here.
            if not fm or fm.get("type") not in ("entity", "decision"):
                return

            name = fm.get("name", md_file.stem)
            handles = fm.get("handles", []) or []
            tags = fm.get("tags", []) or []
            entity_type = fm.get("entity_type", "unknown")
            domain = fm.get("domain", "unknown")

            # For decision records, also index summary and proposer as searchable text
            summary = fm.get("summary", "")
            proposer = fm.get("proposer", "")

            # Build aliases from multiple sources
            aliases = set()
            aliases.add(name.lower())
            aliases.add(md_file.stem.lower())  # slugified name
            for h in handles:
                aliases.add(h.lower().lstrip("@"))
            for t in tags:
                aliases.add(t.lower())
            # Add proposer name as alias for decision records
            if proposer:
                aliases.add(proposer.lower())
            # Add parent_entity as alias (Ganymede: MetaDAO queries should surface its decisions)
            parent = fm.get("parent_entity", "")
            if parent:
                parent_slug = parent.strip("[]").lower()
                aliases.add(parent_slug)

            # Mine body for ticker mentions ($XXXX and standalone ALL-CAPS tokens)
            # — only the first 2000 chars to bound per-file work.
            dollar_tickers = re.findall(r"\$([A-Z]{2,10})", body[:2000])
            for ticker in dollar_tickers:
                aliases.add(ticker.lower())
                aliases.add(f"${ticker.lower()}")
            # Standalone all-caps tokens (likely tickers: OMFG, META, SOL)
            caps_tokens = re.findall(r"\b([A-Z]{2,10})\b", body[:2000])
            for token in caps_tokens:
                # Filter common English words that happen to be short caps
                if token not in ("THE", "AND", "FOR", "NOT", "BUT", "HAS", "ARE", "WAS",
                                 "ITS", "ALL", "CAN", "HAD", "HER", "ONE", "OUR", "OUT",
                                 "NEW", "NOW", "OLD", "SEE", "WAY", "MAY", "SAY", "SHE",
                                 "TWO", "HOW", "BOY", "DID", "GET", "PUT", "KEY", "TVL",
                                 "AMM", "CEO", "SDK", "API", "ICO", "APY", "FAQ", "IPO"):
                    aliases.add(token.lower())
                    aliases.add(f"${token.lower()}")

            # Also add aliases field if it exists (future schema)
            for a in (fm.get("aliases", []) or []):
                aliases.add(a.lower())

            # Extract wiki-linked claim references from body
            related_claims = re.findall(r"\[\[([^\]]+)\]\]", body)

            # Body excerpt — decisions get full body, entities get 500 chars
            ft = fm.get("type")
            if ft == "decision":
                # Full body for decision records — proposals can be 6K+
                overview = body[:8000] if body else (summary or "")
            elif summary:
                # Summary first, padded with body lines up to 500 chars total.
                overview = f"{summary} "
                body_lines = [l for l in body.split("\n") if l.strip() and not l.startswith("#")]
                remaining = 500 - len(overview)
                if remaining > 0:
                    overview += " ".join(body_lines[:10])[:remaining]
            else:
                # No summary: first non-heading body lines, capped at 500 chars.
                body_lines = [l for l in body.split("\n") if l.strip() and not l.startswith("#")]
                overview = " ".join(body_lines[:10])[:500]

            idx = len(self._entities)
            self._entities.append({
                "name": name,
                "path": str(md_file),
                "type": entity_type,
                "domain": domain,
                "tags": tags,
                "handles": handles,
                "aliases": list(aliases),
                "overview": overview,
                "related_claims": related_claims,
            })

            # Register all aliases in lookup map
            for alias in aliases:
                self._entity_alias_map.setdefault(alias, []).append(idx)

        except Exception as e:
            logger.warning("Failed to index entity %s: %s", md_file, e)

    def _index_claims(self):
        """Scan domains/, core/, and foundations/ for claim files."""
        claim_dirs = [
            self.repo_dir / "domains",
            self.repo_dir / "core",
            self.repo_dir / "foundations",
        ]
        for claim_dir in claim_dirs:
            if not claim_dir.exists():
                continue
            for md_file in claim_dir.rglob("*.md"):
                # Skip _map.md and other non-claim files
                if md_file.name.startswith("_"):
                    continue
                try:
                    fm, body = _parse_frontmatter(md_file)
                    if not fm:
                        # Many claims lack explicit type — index them anyway
                        title = md_file.stem.replace("-", " ")
                        self._claims.append({
                            "title": title,
                            "path": str(md_file),
                            "domain": _domain_from_path(md_file, self.repo_dir),
                            "confidence": "unknown",
                            "description": "",
                        })
                        continue

                    # Skip non-claim types if type is explicit
                    ft = fm.get("type")
                    if ft and ft not in ("claim", None):
                        continue

                    title = md_file.stem.replace("-", " ")
                    self._claims.append({
                        "title": title,
                        "path": str(md_file),
                        "domain": fm.get("domain", _domain_from_path(md_file, self.repo_dir)),
                        "confidence": fm.get("confidence", "unknown"),
                        "description": fm.get("description", ""),
                    })
                except Exception as e:
                    logger.warning("Failed to index claim %s: %s", md_file, e)

    def _index_agent_state(self):
        """Scan agents/ for positions and beliefs."""
        agents_dir = self.repo_dir / "agents"
        if not agents_dir.exists():
            return
        for agent_dir in agents_dir.iterdir():
            if not agent_dir.is_dir():
                continue
            agent_name = agent_dir.name

            # Index positions (one file per position under agents/<name>/positions/)
            positions_dir = agent_dir / "positions"
            if positions_dir.exists():
                for md_file in positions_dir.glob("*.md"):
                    try:
                        fm, body = _parse_frontmatter(md_file)
                        title = fm.get("title", md_file.stem.replace("-", " ")) if fm else md_file.stem.replace("-", " ")
                        content = body[:500] if body else ""
                        self._positions.append({
                            "agent": agent_name,
                            "title": title,
                            "path": str(md_file),
                            "content": content,
                        })
                    except Exception as e:
                        logger.warning("Failed to index position %s: %s", md_file, e)

            # Index beliefs (just the file, we'll excerpt on demand)
            beliefs_file = agent_dir / "beliefs.md"
            if beliefs_file.exists():
                try:
                    content = beliefs_file.read_text()[:3000]
                    self._beliefs.append({
                        "agent": agent_name,
                        "path": str(beliefs_file),
                        "content": content,
                    })
                except Exception as e:
                    logger.warning("Failed to index beliefs %s: %s", beliefs_file, e)
|
||||
|
||||
|
||||
# ─── Retrieval ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def retrieve_context(query: str, repo_dir: str, index: KBIndex | None = None,
                     max_claims: int = 8, max_entities: int = 5,
                     max_positions: int = 3,
                     kb_scope: list[str] | None = None) -> KBContext:
    """Main entry point: retrieve full KB context for a query.

    Three layers:
    1. Entity resolution — match query tokens to entities, scored by relevance
    2. Claim search — substring + keyword matching on titles and descriptions
    3. Agent context — positions and beliefs referencing matched entities/claims

    Args:
        query: raw user query text.
        repo_dir: KB repo root (used when no index is supplied, and to
            resolve kb_scope prefixes).
        index: optional pre-built KBIndex; a fresh one is built otherwise.
        max_claims / max_entities / max_positions: result caps per layer.
        kb_scope: optional list of repo-relative path prefixes; when set,
            only claims under those prefixes are scored.
    """
    if index is None:
        index = KBIndex(repo_dir)
    index.ensure_fresh()

    ctx = KBContext()

    # Normalize query
    query_lower = query.lower()
    query_tokens = _tokenize(query_lower)

    # ── Layer 1: Entity Resolution ──
    # Score each entity by how many query tokens match its aliases/name
    scored_entities: list[tuple[float, int]] = []  # (score, index)

    # Build a set of candidate indices from alias map + substring matching
    candidate_indices = set()
    for token in query_tokens:
        if token in index._entity_alias_map:
            candidate_indices.update(index._entity_alias_map[token])
        # "$meta"-style tickers also match their bare alias form.
        if token.startswith("$"):
            bare = token[1:]
            if bare in index._entity_alias_map:
                candidate_indices.update(index._entity_alias_map[bare])

    # Fallback: substring match of tokens (len >= 3) against entity names —
    # O(entities × tokens), catches partial-name queries the alias map misses.
    for i, ent in enumerate(index._entities):
        for token in query_tokens:
            if len(token) >= 3 and token in ent["name"].lower():
                candidate_indices.add(i)

    # Score candidates by query token overlap
    for idx in candidate_indices:
        ent = index._entities[idx]
        score = _score_entity(query_lower, query_tokens, ent)
        if score > 0:
            scored_entities.append((score, idx))

    scored_entities.sort(key=lambda x: x[0], reverse=True)

    for score, idx in scored_entities[:max_entities]:
        ent = index._entities[idx]
        ctx.entities.append(EntityMatch(
            name=ent["name"],
            path=ent["path"],
            entity_type=ent["type"],
            domain=ent["domain"],
            overview=_sanitize_for_prompt(ent["overview"], max_len=8000),
            tags=ent["tags"],
            related_claims=ent["related_claims"],
        ))

    # Collect entity-related claim titles for boosting (wiki-links normalized
    # to lowercase space-separated form to match claim titles).
    entity_claim_titles = set()
    for em in ctx.entities:
        for rc in em.related_claims:
            entity_claim_titles.add(rc.lower().replace("-", " "))

    # ── Layer 2: Claim Search ──
    # Import min score threshold (filters single-stopword garbage matches)
    try:
        from lib.config import RETRIEVAL_MIN_CLAIM_SCORE as MIN_SCORE
    except ImportError:
        MIN_SCORE = 3.0

    scored_claims: list[tuple[float, dict]] = []

    # Normalize kb_scope paths for prefix matching
    _scope_prefixes = None
    if kb_scope:
        _scope_prefixes = [str(Path(repo_dir) / s) for s in kb_scope]

    for claim in index._claims:
        # Domain filtering: if kb_scope is set, only score claims in-scope
        if _scope_prefixes:
            if not any(claim["path"].startswith(p) for p in _scope_prefixes):
                continue
        score = _score_claim(query_lower, query_tokens, claim, entity_claim_titles)
        if score >= MIN_SCORE:
            scored_claims.append((score, claim))

    scored_claims.sort(key=lambda x: x[0], reverse=True)

    for score, claim in scored_claims[:max_claims]:
        ctx.claims.append(ClaimMatch(
            title=claim["title"],
            path=claim["path"],
            domain=claim["domain"],
            confidence=claim["confidence"],
            description=_sanitize_for_prompt(claim.get("description", "")),
            score=score,
        ))

    # ── Layer 3: Agent Context ──
    # Find positions referencing matched entities or claims
    match_terms = set(query_tokens)
    for em in ctx.entities:
        match_terms.add(em.name.lower())
    for cm in ctx.claims:
        # Add key words from matched claim titles
        match_terms.update(t for t in cm.title.lower().split() if len(t) >= 4)

    # A position qualifies when at least 2 match terms appear in its text.
    for pos in index._positions:
        pos_text = (pos["title"] + " " + pos["content"]).lower()
        overlap = sum(1 for t in match_terms if t in pos_text)
        if overlap >= 2:
            ctx.positions.append(PositionMatch(
                agent=pos["agent"],
                title=pos["title"],
                content=_sanitize_for_prompt(pos["content"]),
            ))
            if len(ctx.positions) >= max_positions:
                break

    # Extract relevant belief excerpts (same 2-term overlap threshold;
    # note belief excerpts are NOT capped like positions are).
    for belief in index._beliefs:
        belief_text = belief["content"].lower()
        overlap = sum(1 for t in match_terms if t in belief_text)
        if overlap >= 2:
            # Extract relevant paragraphs
            excerpts = _extract_relevant_paragraphs(belief["content"], match_terms, max_paragraphs=2)
            for exc in excerpts:
                ctx.belief_excerpts.append(f"**{belief['agent']}**: {_sanitize_for_prompt(exc)}")

    # Stats — index sizes plus how much matched, for logging/diagnostics.
    ctx.stats = {
        "total_claims": len(index._claims),
        "total_entities": len(index._entities),
        "total_positions": len(index._positions),
        "entities_matched": len(ctx.entities),
        "claims_matched": len(ctx.claims),
    }

    return ctx
|
||||
|
||||
|
||||
# ─── Scoring ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
_STOP_WORDS = frozenset({
|
||||
"the", "for", "and", "but", "not", "you", "can", "has", "are", "was",
|
||||
"its", "all", "had", "her", "one", "our", "out", "new", "now", "old",
|
||||
"see", "way", "may", "say", "she", "two", "how", "did", "get", "put",
|
||||
"give", "me", "ok", "full", "text", "what", "about", "tell", "this",
|
||||
"that", "with", "from", "have", "more", "some", "than", "them", "then",
|
||||
"into", "also", "just", "your", "been", "here", "will", "does", "know",
|
||||
"please", "think",
|
||||
})
|
||||
|
||||
|
||||
def _score_entity(query_lower: str, query_tokens: list[str], entity: dict) -> float:
    """Score an entity against a query. Higher = more relevant."""
    name = entity["name"].lower()
    overview = entity.get("overview", "").lower()
    alias_list = entity.get("aliases", [])

    # Only tokens of 3+ chars that are not stop words carry signal.
    signal = [tok for tok in query_tokens if tok not in _STOP_WORDS and len(tok) >= 3]

    total = 0.0
    for tok in signal:
        if tok in name:
            # Name match — highest signal
            total += 3.0
        elif any(tok == alias or tok in alias for alias in alias_list):
            # Alias match (tags, proposer, parent_entity, tickers)
            total += 1.0
        elif tok in overview:
            # Overview match (body content)
            total += 0.5

    # Adjacent token pairs that appear verbatim in the entity name get a
    # strong boost (e.g. "robin hanson" in the entity name).
    if len(signal) >= 2:
        for first, second in zip(signal, signal[1:]):
            if f"{first} {second}" in name:
                total += 5.0

    return total
|
||||
|
||||
|
||||
def _score_claim(query_lower: str, query_tokens: list[str], claim: dict,
                 entity_claim_titles: set[str]) -> float:
    """Score a claim against a query. Higher = more relevant."""
    claim_title = claim["title"].lower()
    haystack = claim_title + " " + claim.get("description", "").lower()

    # Stopword filtering mirrors _score_entity. Without it, "from", "what",
    # "to" all score points and garbage matches slip through.
    signal = [tok for tok in query_tokens if tok not in _STOP_WORDS and len(tok) >= 3]

    total = 0.0

    # Substring hits on meaningful tokens; title hits count double.
    for tok in signal:
        if tok in haystack:
            total += 2.0 if tok in claim_title else 1.0

    # Wiki-linked from an already-matched entity → strong boost.
    if any(linked in claim_title for linked in entity_claim_titles):
        total += 5.0

    # Adjacent token pairs found verbatim in title+description.
    if len(signal) >= 2:
        for first, second in zip(signal, signal[1:]):
            if f"{first} {second}" in haystack:
                total += 3.0

    return total
|
||||
|
||||
|
||||
# ─── Helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _parse_frontmatter(path: Path) -> tuple[dict | None, str]:
|
||||
"""Parse YAML frontmatter and body from a markdown file."""
|
||||
try:
|
||||
text = path.read_text(errors="replace")
|
||||
except Exception:
|
||||
return None, ""
|
||||
|
||||
if not text.startswith("---"):
|
||||
return None, text
|
||||
|
||||
end = text.find("\n---", 3)
|
||||
if end == -1:
|
||||
return None, text
|
||||
|
||||
try:
|
||||
fm = yaml.safe_load(text[3:end])
|
||||
if not isinstance(fm, dict):
|
||||
return None, text
|
||||
body = text[end + 4:].strip()
|
||||
return fm, body
|
||||
except yaml.YAMLError:
|
||||
return None, text
|
||||
|
||||
|
||||
def _domain_from_path(path: Path, repo_dir: Path) -> str:
|
||||
"""Infer domain from file path."""
|
||||
rel = path.relative_to(repo_dir)
|
||||
parts = rel.parts
|
||||
if len(parts) >= 2 and parts[0] in ("domains", "entities", "decisions"):
|
||||
return parts[1]
|
||||
if len(parts) >= 1 and parts[0] == "core":
|
||||
return "core"
|
||||
if len(parts) >= 1 and parts[0] == "foundations":
|
||||
return parts[1] if len(parts) >= 2 else "foundations"
|
||||
return "unknown"
|
||||
|
||||
|
||||
def _tokenize(text: str) -> list[str]:
|
||||
"""Split query into searchable tokens."""
|
||||
# Keep $ prefix for ticker matching
|
||||
tokens = re.findall(r"\$?\w+", text.lower())
|
||||
# Filter out very short stop words but keep short tickers
|
||||
return [t for t in tokens if len(t) >= 2]
|
||||
|
||||
|
||||
def _sanitize_for_prompt(text: str, max_len: int = 1000) -> str:
|
||||
"""Sanitize content before injecting into LLM prompt (Ganymede: security)."""
|
||||
# Strip code blocks
|
||||
text = re.sub(r"```.*?```", "[code block removed]", text, flags=re.DOTALL)
|
||||
# Strip anything that looks like system instructions
|
||||
text = re.sub(r"(system:|assistant:|human:|<\|.*?\|>)", "", text, flags=re.IGNORECASE)
|
||||
# Truncate
|
||||
return text[:max_len]
|
||||
|
||||
|
||||
def _extract_relevant_paragraphs(text: str, terms: set[str], max_paragraphs: int = 2) -> list[str]:
|
||||
"""Extract paragraphs from text that contain the most matching terms."""
|
||||
paragraphs = text.split("\n\n")
|
||||
scored = []
|
||||
for p in paragraphs:
|
||||
p_stripped = p.strip()
|
||||
if len(p_stripped) < 20:
|
||||
continue
|
||||
p_lower = p_stripped.lower()
|
||||
overlap = sum(1 for t in terms if t in p_lower)
|
||||
if overlap > 0:
|
||||
scored.append((overlap, p_stripped[:300]))
|
||||
scored.sort(key=lambda x: x[0], reverse=True)
|
||||
return [text for _, text in scored[:max_paragraphs]]
|
||||
|
||||
|
||||
def format_context_for_prompt(ctx: KBContext) -> str:
    """Format KBContext as text for injection into the LLM prompt."""
    out: list[str] = []

    if ctx.entities:
        out.append("## Matched Entities")
        for idx, ent in enumerate(ctx.entities):
            out.append(f"**{ent.name}** ({ent.entity_type}, {ent.domain})")
            # First three entities get near-full content; the rest a teaser.
            limit = 8000 if idx < 3 else 500
            out.append(ent.overview[:limit])
            if ent.related_claims:
                out.append("Related claims: " + ", ".join(ent.related_claims[:5]))
            out.append("")

    if ctx.claims:
        out.append("## Relevant KB Claims")
        for claim in ctx.claims:
            out.append(f"- **{claim.title}** (confidence: {claim.confidence}, domain: {claim.domain})")
            if claim.description:
                out.append(f" {claim.description}")
        out.append("")

    if ctx.positions:
        out.append("## Agent Positions")
        for pos in ctx.positions:
            out.append(f"**{pos.agent}**: {pos.title}")
            out.append(pos.content[:200])
        out.append("")

    if ctx.belief_excerpts:
        out.append("## Relevant Beliefs")
        for excerpt in ctx.belief_excerpts:
            out.append(excerpt)
        out.append("")

    if not out:
        return "No relevant KB content found for this query."

    # Stats footer so the model (and audit logs) can see KB coverage.
    out.append(f"---\nKB: {ctx.stats.get('total_claims', '?')} claims, "
               f"{ctx.stats.get('total_entities', '?')} entities. "
               f"Matched: {ctx.stats.get('entities_matched', 0)} entities, "
               f"{ctx.stats.get('claims_matched', 0)} claims.")

    return "\n".join(out)
|
||||
|
||||
|
||||
# --- Qdrant vector search integration ---
|
||||
|
||||
# Module-level import guard for lib.search (Fix 3: no per-call sys.path manipulation)
# Resolved once at import time. On failure _vector_search stays None and
# retrieve_vector_context() degrades to returning empty results instead of raising.
_vector_search = None
try:
    import sys as _sys
    import os as _os
    # Pipeline root = parent of the directory containing this file; lib/ lives there.
    _pipeline_root = _os.path.dirname(_os.path.dirname(_os.path.abspath(__file__)))
    if _pipeline_root not in _sys.path:
        _sys.path.insert(0, _pipeline_root)
    from lib.search import search as _vector_search
except ImportError:
    logger.warning("Qdrant search unavailable at module load (lib.search not found)")
|
||||
|
||||
|
||||
def retrieve_vector_context(query: str,
                            keyword_paths: list[str] | None = None) -> tuple[str, dict]:
    """Semantic search via Qdrant — returns (formatted_text, metadata).

    Complements retrieve_context() (symbolic/keyword) with semantic similarity.
    Falls back gracefully if Qdrant is unavailable (module-level _vector_search
    guard) or if the search itself errors.

    Args:
        query: Free-text user query to embed and search.
        keyword_paths: Claim paths already matched by keyword search. These are
            excluded at the Qdrant query level AND from graph expansion to avoid
            duplicates in the prompt.

    Returns:
        (formatted_text, metadata_dict)
        metadata_dict: {direct_results: [...], expanded_results: [...],
                        layers_hit: [...], duration_ms: int}
        On any failure the formatted text is "" and the metadata is empty.
    """
    import time as _time
    t0 = _time.monotonic()
    # Template for the failure/empty return paths.
    empty_meta = {"direct_results": [], "expanded_results": [],
                  "layers_hit": [], "duration_ms": 0}

    # lib.search was not importable at module load — vector layer disabled.
    if _vector_search is None:
        return "", empty_meta

    try:
        results = _vector_search(query, expand=True,
                                 exclude=keyword_paths)
    except Exception as e:
        # Best-effort layer: log and degrade rather than break the response.
        logger.warning("Qdrant search failed: %s", e)
        return "", empty_meta

    duration = int((_time.monotonic() - t0) * 1000)

    # Backend reported an error, or nothing matched — still report duration.
    if results.get("error") or not results.get("direct_results"):
        return "", {**empty_meta, "duration_ms": duration,
                    "error": results.get("error")}

    # Which retrieval layers contributed (for audit metadata).
    layers_hit = ["qdrant"]
    if results.get("expanded_results"):
        layers_hit.append("graph")

    # Build structured metadata for audit
    meta = {
        "direct_results": [
            {"path": r["claim_path"], "title": r["claim_title"],
             "score": r["score"], "domain": r.get("domain", ""),
             "source": "qdrant"}
            for r in results["direct_results"]
        ],
        "expanded_results": [
            {"path": r["claim_path"], "title": r["claim_title"],
             "edge_type": r.get("edge_type", "related"),
             "from_claim": r.get("from_claim", ""), "source": "graph"}
            for r in results.get("expanded_results", [])
        ],
        "layers_hit": layers_hit,
        "duration_ms": duration,
    }

    # Build formatted text for prompt (Fix 4: subsection headers)
    sections = []
    sections.append("## Semantic Search Results (Qdrant)")
    sections.append("")
    sections.append("### Direct matches")

    for r in results["direct_results"]:
        # Similarity rendered as an integer percentage for readability.
        score_pct = int(r["score"] * 100)
        line = f"- **{r['claim_title']}** ({score_pct}% match"
        if r.get("domain"):
            line += f", {r['domain']}"
        if r.get("confidence"):
            line += f", {r['confidence']}"
        line += ")"
        sections.append(line)
        if r.get("snippet"):
            sections.append(f" {r['snippet']}")

    if results.get("expanded_results"):
        sections.append("")
        sections.append("### Related claims (graph expansion)")
        for r in results["expanded_results"]:
            edge = r.get("edge_type", "related")
            # Only show the weight when it differs from the default 1.0.
            weight_str = f" ×{r.get('edge_weight', 1.0)}" if r.get("edge_weight", 1.0) != 1.0 else ""
            sections.append(f"- {r['claim_title']} ({edge}{weight_str} → {r.get('from_claim', '').split('/')[-1]})")

    return "\n".join(sections), meta
|
||||
719
ops/pipeline-v2/telegram/kb_tools.py
Normal file
719
ops/pipeline-v2/telegram/kb_tools.py
Normal file
|
|
@ -0,0 +1,719 @@
|
|||
#!/usr/bin/env python3
|
||||
"""KB tools for LLM function-calling — source tracing + entity/claim lookup.
|
||||
|
||||
These tools let the agent trace claims back to their original sources,
|
||||
find all claims from a specific piece of research, and read source documents.
|
||||
|
||||
Epimetheus owns this module.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
logger = logging.getLogger("tg.kb_tools")
|
||||
|
||||
|
||||
# ─── Tool definitions (OpenAI function-calling format) ───────────────
|
||||
|
||||
# OpenAI function-calling tool schema. Each entry is passed verbatim to the
# chat-completions API; execute_tool() dispatches on the "name" field, so the
# names here must stay in sync with that dispatcher.
TOOL_DEFINITIONS = [
    # Source tracing: all claims extracted from a matching source document.
    {
        "type": "function",
        "function": {
            "name": "find_by_source",
            "description": (
                "Find all claims extracted from a specific source (article, paper, thread). "
                "Search by author name, source title, or keywords. Returns all claims from "
                "matching sources with their frontmatter."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Author name, source title, or keywords to match against claim source fields",
                    },
                },
                "required": ["query"],
            },
        },
    },
    # Fetch the original document behind extracted claims.
    {
        "type": "function",
        "function": {
            "name": "read_source",
            "description": (
                "Read the original source document (article, thread, paper) that claims were "
                "extracted from. Use when you need the full context behind a claim, not just "
                "the extracted summary."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "source_title": {
                        "type": "string",
                        "description": "Title or slug of the source document to read",
                    },
                },
                "required": ["source_title"],
            },
        },
    },
    # Entity profile lookup.
    {
        "type": "function",
        "function": {
            "name": "read_entity",
            "description": "Read the full profile of a KB entity (project, person, protocol).",
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "Entity name or slug",
                    },
                },
                "required": ["name"],
            },
        },
    },
    # Wiki-link enumeration from an entity file.
    {
        "type": "function",
        "function": {
            "name": "list_entity_links",
            "description": "List all entities and claims linked from an entity's wiki-links.",
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "Entity name or slug",
                    },
                },
                "required": ["name"],
            },
        },
    },
    # Full claim-file read.
    {
        "type": "function",
        "function": {
            "name": "read_claim",
            "description": "Read the full content of a specific claim file.",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {
                        "type": "string",
                        "description": "Claim title or slug",
                    },
                },
                "required": ["title"],
            },
        },
    },
    # Keyword search over claims.
    {
        "type": "function",
        "function": {
            "name": "search_kb",
            "description": "Search the KB for claims matching a query. Uses keyword matching.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query",
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Max results to return (default 5)",
                    },
                },
                "required": ["query"],
            },
        },
    },
    # 1-hop knowledge-graph traversal from a claim.
    {
        "type": "function",
        "function": {
            "name": "explore_graph",
            "description": (
                "Follow knowledge graph edges from a claim to find connected claims. "
                "Returns all claims linked via supports, challenges, depends_on, and related edges. "
                "Use this to discover the full argument structure around a claim — what supports it, "
                "what challenges it, and what it depends on."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "claim_title": {
                        "type": "string",
                        "description": "Title or slug of the claim to explore edges from",
                    },
                },
                "required": ["claim_title"],
            },
        },
    },
    # Search the raw source archive (not extracted claims).
    {
        "type": "function",
        "function": {
            "name": "search_sources",
            "description": (
                "Search the source archive for original documents by topic, author, or title. "
                "Returns matching source files with their titles and first few lines. "
                "Use this when you want to find the original research/article/thread, not just extracted claims."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Topic, author name, or keywords to search source documents",
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Max results to return (default 5)",
                    },
                },
                "required": ["query"],
            },
        },
    },
    # Pipeline DB lookup: PR status and eval verdicts.
    {
        "type": "function",
        "function": {
            "name": "pr_status",
            "description": (
                "Check the status of a pipeline PR by number. Returns eval verdicts, "
                "merge status, time in queue, rejection reasons, and retry counts."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "pr_number": {
                        "type": "integer",
                        "description": "PR number to look up",
                    },
                },
                "required": ["pr_number"],
            },
        },
    },
    # Near-duplicate detection against existing KB content.
    {
        "type": "function",
        "function": {
            "name": "check_duplicate",
            "description": (
                "Check if a claim is a near-duplicate of existing KB content. "
                "Returns top-3 closest matches with similarity scores. "
                ">=0.85 = likely duplicate, 0.70-0.85 = check manually, <0.70 = novel."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The claim text to check for duplicates",
                    },
                },
                "required": ["text"],
            },
        },
    },
]
|
||||
|
||||
|
||||
# ─── Tool implementations ────────────────────────────────────────────
|
||||
|
||||
|
||||
def find_by_source(query: str, kb_dir: str) -> str:
    """Find all claims extracted from sources matching the query.

    Searches claim frontmatter `source:` fields for author names, titles, keywords.
    Returns structured list of all claims from matching sources.
    """
    lowered = query.lower()
    tokens = [t for t in re.findall(r'\w+', lowered) if len(t) >= 3]
    # A file matches when at least half the query tokens (minimum one)
    # appear in its source/source_file frontmatter fields.
    needed = max(1, len(tokens) // 2)

    hits: list[dict] = []
    for sub in ("domains", "core", "foundations"):
        base = Path(kb_dir) / sub
        if not base.exists():
            continue
        for md in base.rglob("*.md"):
            if md.name.startswith("_"):
                continue
            try:
                fm, _body = _parse_frontmatter(md)
                if not fm:
                    continue
                source = fm.get("source", "")
                source_file = fm.get("source_file", "")
                haystack = f"{source} {source_file}".lower()
                overlap = sum(1 for t in tokens if t in haystack)
                if overlap >= needed:
                    hits.append({
                        "title": md.stem.replace("-", " "),
                        "path": str(md.relative_to(kb_dir)),
                        "source": source,
                        "source_file": source_file,
                        "domain": fm.get("domain", "unknown"),
                        "confidence": fm.get("confidence", "unknown"),
                        "description": fm.get("description", ""),
                        "score": overlap,
                    })
            except Exception:
                continue

    if not hits:
        return f"No claims found from sources matching '{query}'."

    # Best-scoring claims first; grouping below preserves this order.
    hits.sort(key=lambda m: m["score"], reverse=True)

    by_source: dict[str, list[dict]] = {}
    for h in hits:
        by_source.setdefault(h["source"] or "unknown", []).append(h)

    out = [f"Found {len(hits)} claims from {len(by_source)} matching sources:\n"]
    for source_name, claims in list(by_source.items())[:5]:  # cap at 5 sources
        out.append(f"## Source: {source_name}")
        if claims[0].get("source_file"):
            out.append(f"File: {claims[0]['source_file']}")
        for c in claims[:10]:  # cap at 10 claims per source
            out.append(f"- **{c['title']}** ({c['confidence']}, {c['domain']})")
            if c["description"]:
                out.append(f" {c['description'][:200]}")
        out.append("")

    # Hard cap for prompt safety.
    return "\n".join(out)[:4000]
|
||||
|
||||
|
||||
def read_source(source_title: str, kb_dir: str) -> str:
    """Read the original source document from the archive.

    Looks in inbox/archive/ and sources/ for matching files.
    """
    wanted = source_title.lower()
    wanted_slug = re.sub(r'[^a-z0-9]+', '-', wanted).strip('-')

    roots = [
        Path(kb_dir) / "inbox" / "archive",
        Path(kb_dir) / "sources",
        Path(kb_dir) / "inbox" / "queue",
    ]

    winner = None
    winner_score = 0
    for root in roots:
        if not root.exists():
            continue
        for md in root.rglob("*.md"):
            stem = md.stem.lower()
            # Token-overlap score against the filename slug.
            pts = sum(1 for tok in re.findall(r'\w+', wanted)
                      if len(tok) >= 3 and tok in stem)
            if wanted_slug in stem:
                pts += 5  # exact slug hit dominates token overlap
            if pts > winner_score:
                winner_score = pts
                winner = md

    if not winner:
        return f"Source document '{source_title}' not found in archive."

    try:
        doc = winner.read_text(errors="replace")
        # Truncate to 4K for prompt safety.
        if len(doc) > 4000:
            doc = doc[:4000] + "\n\n[... truncated, full document is longer ...]"
        return f"## Source: {winner.name}\n\n{doc}"
    except Exception as e:
        return f"Error reading source: {e}"
|
||||
|
||||
|
||||
def read_entity(name: str, kb_dir: str) -> str:
    """Read the full profile of a KB entity."""
    target = _find_file(name, [
        Path(kb_dir) / "entities",
        Path(kb_dir) / "decisions",
    ])
    if target is None:
        return f"Entity '{name}' not found."
    try:
        # Cap at 4K chars for prompt safety.
        return target.read_text(errors="replace")[:4000]
    except Exception as e:
        return f"Error reading entity: {e}"
|
||||
|
||||
|
||||
def list_entity_links(name: str, kb_dir: str) -> str:
    """List all wiki-links from an entity file, with dedup."""
    target = _find_file(name, [
        Path(kb_dir) / "entities",
        Path(kb_dir) / "decisions",
    ])
    if target is None:
        return f"Entity '{name}' not found."

    try:
        body = target.read_text(errors="replace")
        # Case-insensitive dedup, preserving first-seen order.
        seen: set[str] = set()
        ordered: list[str] = []
        for link in re.findall(r"\[\[([^\]]+)\]\]", body):
            key = link.lower()
            if key not in seen:
                seen.add(key)
                ordered.append(link)
        if not ordered:
            return f"Entity '{name}' has no wiki-links."
        listing = "\n".join(f"- [[{link}]]" for link in ordered)
        return f"Entity '{name}' links to {len(ordered)} items:\n" + listing
    except Exception as e:
        return f"Error reading entity links: {e}"
|
||||
|
||||
|
||||
def read_claim(title: str, kb_dir: str) -> str:
    """Read the full content of a claim file."""
    target = _find_file(title, [
        Path(kb_dir) / "domains",
        Path(kb_dir) / "core",
        Path(kb_dir) / "foundations",
    ])
    if target is None:
        return f"Claim '{title}' not found."
    try:
        # Cap at 4K chars for prompt safety.
        return target.read_text(errors="replace")[:4000]
    except Exception as e:
        return f"Error reading claim: {e}"
|
||||
|
||||
|
||||
def search_kb(query: str, kb_dir: str, max_results: int = 5) -> str:
    """Search KB claims by keyword matching."""
    # Local import: kb_retrieval is a sibling module, loaded on first use.
    from kb_retrieval import KBIndex, retrieve_context

    idx = KBIndex(kb_dir)
    idx.ensure_fresh()
    ctx = retrieve_context(query, kb_dir, index=idx, max_claims=max_results)
    if not ctx.claims:
        return f"No claims found for '{query}'."

    out = [f"Found {len(ctx.claims)} claims:"]
    for claim in ctx.claims:
        out.append(f"- **{claim.title}** ({claim.confidence}, {claim.domain}, score: {claim.score:.1f})")
        if claim.description:
            out.append(f" {claim.description[:200]}")
    return "\n".join(out)
|
||||
|
||||
|
||||
def explore_graph(claim_title: str, kb_dir: str) -> str:
    """Follow knowledge graph edges from a claim to find connected claims.

    Uses lib/search.py graph_expand() for 1-hop traversal of supports/challenges/
    depends_on/related edges in frontmatter. Falls back to parsing the claim's
    own frontmatter edges when lib.search is not importable.

    Returns a human-readable listing grouped by edge type, capped at 4000 chars.
    """
    # Find the claim file first
    claim_file = _find_file(claim_title, [
        Path(kb_dir) / "domains",
        Path(kb_dir) / "core",
        Path(kb_dir) / "foundations",
    ])
    if not claim_file:
        return f"Claim '{claim_title}' not found. Try a different title or use search_kb to find it first."

    # graph_expand expects repo-relative paths; fall back to absolute if the
    # file is somehow outside kb_dir.
    try:
        rel_path = str(claim_file.relative_to(kb_dir))
    except ValueError:
        rel_path = str(claim_file)

    # Use the existing graph_expand from lib/search.py
    try:
        from lib.search import graph_expand
        expanded = graph_expand([rel_path], repo_root=Path(kb_dir), max_expanded=20)
    except ImportError:
        # Fallback: parse edges directly from the file's frontmatter.
        # Only 1-hop, no weights beyond the default 1.0.
        expanded = []
        fm, body = _parse_frontmatter(claim_file)
        if fm:
            for edge_type in ("supports", "challenges", "challenged_by", "depends_on", "related"):
                targets = fm.get(edge_type, [])
                # Frontmatter may hold a single string or a list of targets.
                if isinstance(targets, str):
                    targets = [targets]
                if isinstance(targets, list):
                    for t in targets:
                        expanded.append({"claim_title": t, "edge_type": edge_type, "edge_weight": 1.0})

    if not expanded:
        return f"Claim '{claim_title}' has no graph edges (no supports, challenges, or related claims)."

    # Group by edge type for readability
    by_type: dict[str, list[dict]] = {}
    for e in expanded:
        by_type.setdefault(e["edge_type"], []).append(e)

    lines = [f"Graph edges from '{claim_title}' ({len(expanded)} connected claims):\n"]
    # Human-readable section headers per edge type; unknown types fall back
    # to the raw edge name via .get() below.
    type_labels = {
        "supports": "Supports (this claim backs these up)",
        "challenges": "Challenges (this claim argues against these)",
        "challenged_by": "Challenged by (these argue against this claim)",
        "depends_on": "Depends on (prerequisites for this claim)",
        "related": "Related (connected by topic)",
        "wiki_links": "Wiki-linked (mentioned in body text)",
    }
    for edge_type, items in by_type.items():
        label = type_labels.get(edge_type, edge_type)
        lines.append(f"### {label}")
        for item in items:
            title = item.get("claim_title", "unknown")
            weight = item.get("edge_weight", 1.0)
            # Only show the weight when it differs from the default.
            lines.append(f"- {title}" + (f" (weight: {weight})" if weight != 1.0 else ""))
        lines.append("")

    # Cap at 4K chars for prompt safety.
    return "\n".join(lines)[:4000]
|
||||
|
||||
|
||||
def search_sources(query: str, kb_dir: str, max_results: int = 5) -> str:
    """Search the source archive for original documents by topic/author/title.

    Scans inbox/archive/ and sources/ directories, scoring by token overlap.
    """
    tokens = [t for t in re.findall(r'\w+', query.lower()) if len(t) >= 3]
    if not tokens:
        return "Query too short — provide at least one keyword with 3+ characters."

    roots = [
        Path(kb_dir) / "inbox" / "archive",
        Path(kb_dir) / "sources",
        Path(kb_dir) / "inbox" / "queue",
    ]
    # A file qualifies when at least a third of the tokens (minimum one) hit.
    threshold = max(1, len(tokens) // 3)

    found: list[dict] = []
    for root in roots:
        if not root.exists():
            continue
        for md in root.rglob("*.md"):
            if md.name.startswith("_"):
                continue
            stem_words = md.stem.lower().replace("-", " ")
            pts = sum(1 for t in tokens if t in stem_words)
            if pts == 0:
                # No filename hit — fall back to the first 500 chars of
                # content (author/topic often appears there), at half weight.
                try:
                    head = md.read_text(errors="replace")[:500].lower()
                except Exception:
                    continue
                pts = sum(0.5 for t in tokens if t in head)
            if pts >= threshold:
                try:
                    preview = md.read_text(errors="replace")[:300].strip()
                except Exception:
                    preview = "(could not read)"
                found.append({
                    "title": md.stem.replace("-", " "),
                    "path": str(md.relative_to(kb_dir)),
                    "score": pts,
                    "preview": preview,
                })

    if not found:
        return f"No source documents found matching '{query}'. Try different keywords or check find_by_source for claims from that source."

    found.sort(key=lambda m: m["score"], reverse=True)
    top = found[:max_results]

    out = [f"Found {len(top)} source documents:\n"]
    for m in top:
        out.append(f"### {m['title']}")
        out.append(f"Path: {m['path']}")
        out.append(f"{m['preview'][:200]}")
        out.append("")

    # Cap at 4K chars for prompt safety.
    return "\n".join(out)[:4000]
|
||||
|
||||
|
||||
# ─── Tool dispatcher ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
def execute_tool(tool_name: str, args: dict, kb_dir: str) -> str:
|
||||
"""Dispatch a tool call by name. Returns the tool's string result."""
|
||||
if tool_name == "find_by_source":
|
||||
return find_by_source(args.get("query", ""), kb_dir)
|
||||
elif tool_name == "read_source":
|
||||
return read_source(args.get("source_title", ""), kb_dir)
|
||||
elif tool_name == "read_entity":
|
||||
return read_entity(args.get("name", ""), kb_dir)
|
||||
elif tool_name == "list_entity_links":
|
||||
return list_entity_links(args.get("name", ""), kb_dir)
|
||||
elif tool_name == "read_claim":
|
||||
return read_claim(args.get("title", ""), kb_dir)
|
||||
elif tool_name == "search_kb":
|
||||
return search_kb(args.get("query", ""), kb_dir, args.get("max_results", 5))
|
||||
elif tool_name == "explore_graph":
|
||||
return explore_graph(args.get("claim_title", ""), kb_dir)
|
||||
elif tool_name == "search_sources":
|
||||
return search_sources(args.get("query", ""), kb_dir, args.get("max_results", 5))
|
||||
elif tool_name == "pr_status":
|
||||
return _tool_pr_status(args.get("pr_number", 0))
|
||||
elif tool_name == "check_duplicate":
|
||||
return _tool_check_duplicate(args.get("text", ""))
|
||||
else:
|
||||
return f"Unknown tool: {tool_name}"
|
||||
|
||||
|
||||
# ─── Helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _parse_frontmatter(path: Path) -> tuple[dict | None, str]:
|
||||
"""Parse YAML frontmatter and body from a markdown file."""
|
||||
try:
|
||||
text = path.read_text(errors="replace")
|
||||
except Exception:
|
||||
return None, ""
|
||||
|
||||
if not text.startswith("---"):
|
||||
return None, text
|
||||
|
||||
end = text.find("\n---", 3)
|
||||
if end == -1:
|
||||
return None, text
|
||||
|
||||
try:
|
||||
fm = yaml.safe_load(text[3:end])
|
||||
if not isinstance(fm, dict):
|
||||
return None, text
|
||||
body = text[end + 4:].strip()
|
||||
return fm, body
|
||||
except yaml.YAMLError:
|
||||
return None, text
|
||||
|
||||
|
||||
def _find_file(name: str, search_dirs: list[Path]) -> Path | None:
|
||||
"""Find a markdown file by name/slug across search directories."""
|
||||
slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
|
||||
name_lower = name.lower()
|
||||
|
||||
for search_dir in search_dirs:
|
||||
if not search_dir.exists():
|
||||
continue
|
||||
for md_file in search_dir.rglob("*.md"):
|
||||
if md_file.name.startswith("_"):
|
||||
continue
|
||||
stem_lower = md_file.stem.lower()
|
||||
# Exact slug match
|
||||
if stem_lower == slug:
|
||||
return md_file
|
||||
# Normalized match (spaces vs hyphens)
|
||||
if stem_lower.replace("-", " ") == name_lower.replace("-", " "):
|
||||
return md_file
|
||||
# Substring match for long titles
|
||||
if len(slug) >= 8 and slug in stem_lower:
|
||||
return md_file
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# ─── Pipeline DB tools ──────────────────────────────────────────────
|
||||
|
||||
|
||||
def _tool_pr_status(pr_number: int) -> str:
|
||||
"""Wrapper for pr_status() — connects to pipeline DB, returns formatted string."""
|
||||
import json
|
||||
import sqlite3
|
||||
|
||||
db_path = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
|
||||
try:
|
||||
conn = sqlite3.connect(db_path)
|
||||
conn.row_factory = sqlite3.Row
|
||||
|
||||
row = conn.execute(
|
||||
"""SELECT number, branch, source_path, status, domain, agent,
|
||||
commit_type, tier, leo_verdict, domain_verdict,
|
||||
domain_agent, eval_issues, priority, origin,
|
||||
cost_usd, created_at, merged_at, last_attempt, last_error,
|
||||
transient_retries, substantive_retries, description
|
||||
FROM prs WHERE number = ?""",
|
||||
(pr_number,),
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
if not row:
|
||||
return f"PR #{pr_number} not found."
|
||||
|
||||
issues = []
|
||||
try:
|
||||
issues = json.loads(row["eval_issues"] or "[]")
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
|
||||
lines = [
|
||||
f"PR #{row['number']} — {row['status'].upper()}",
|
||||
f"Branch: {row['branch']}",
|
||||
f"Domain: {row['domain'] or 'unknown'} | Agent: {row['agent'] or 'pipeline'}",
|
||||
f"Type: {row['commit_type'] or 'unknown'} | Tier: {row['tier'] or 'unknown'}",
|
||||
f"Leo verdict: {row['leo_verdict']} | Domain verdict: {row['domain_verdict']}",
|
||||
]
|
||||
if row["description"]:
|
||||
lines.append(f"Description: {row['description']}")
|
||||
if issues:
|
||||
lines.append(f"Eval issues: {', '.join(str(i) for i in issues)}")
|
||||
if row["last_error"]:
|
||||
lines.append(f"Last error: {row['last_error'][:200]}")
|
||||
lines.append(f"Retries: {row['transient_retries']} transient, {row['substantive_retries']} substantive")
|
||||
lines.append(f"Created: {row['created_at']} | Last attempt: {row['last_attempt']}")
|
||||
if row["merged_at"]:
|
||||
lines.append(f"Merged: {row['merged_at']}")
|
||||
if row["cost_usd"]:
|
||||
lines.append(f"Eval cost: ${row['cost_usd']:.4f}")
|
||||
|
||||
return "\n".join(lines)
|
||||
except Exception as e:
|
||||
return f"Error querying PR #{pr_number}: {e}"
|
||||
|
||||
|
||||
def _tool_check_duplicate(text: str) -> str:
    """Wrapper for check_duplicate() — calls Qdrant, returns formatted string."""
    import sys
    sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
    from lib.search import check_duplicate as _check_dup

    if not text:
        return "Error: text is required."

    result = _check_dup(text)
    if result.get("error"):
        return f"Error: {result['error']}"

    out = [f"Verdict: {result['verdict'].upper()} (highest score: {result['highest_score']:.4f})"]

    matches = result["matches"]
    for idx, match in enumerate(matches, 1):
        out.append(
            f"  {idx}. [{match['score']:.4f}] {match['claim_title'][:80]}"
            f"\n    Path: {match['claim_path']}"
        )
    if not matches:
        out.append("  No matches found above minimum threshold.")

    return "\n".join(out)
|
||||
112
ops/pipeline-v2/telegram/market_data.py
Normal file
112
ops/pipeline-v2/telegram/market_data.py
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Market data API client for live token prices.
|
||||
|
||||
Calls Ben's teleo-ai-api endpoint for ownership coin prices.
|
||||
Used by the Telegram bot to give Rio real-time market context.
|
||||
|
||||
Epimetheus owns this module. Rhea: static API key pattern.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import aiohttp
|
||||
|
||||
logger = logging.getLogger("market-data")

# Ben's teleo-ai-api market-data tool endpoint (Cloud Run).
API_URL = "https://teleo-ai-api-257133920458.us-east4.run.app/v0/chat/tool/market-data"
# Static internal API key, read from disk on every request (see _load_api_key).
API_KEY_FILE = "/opt/teleo-eval/secrets/market-data-key"

# Cache: avoid hitting the API on every message
# Process-local and unbounded; entries expire logically via CACHE_TTL
# checks in get_token_price, they are never evicted.
_cache: dict[str, dict] = {}  # token_name → {data, timestamp}
CACHE_TTL = 300  # 5 minutes
|
||||
|
||||
|
||||
def _load_api_key() -> str | None:
    """Read the market-data API key from the secrets file.

    Returns the stripped key string, or None (with a warning logged)
    when the file is missing or unreadable.
    """
    key_path = Path(API_KEY_FILE)
    try:
        return key_path.read_text().strip()
    except Exception:
        logger.warning("Market data API key not found at %s", API_KEY_FILE)
        return None
|
||||
|
||||
|
||||
async def get_token_price(token_name: str) -> dict | None:
    """Fetch live market data for a token.

    Returns dict with price, market_cap, volume, etc. or None on failure
    (no key, HTTP error, timeout). Caches results for CACHE_TTL seconds.
    """
    import time

    symbol = token_name.upper().strip("$")

    # Serve from the process-local cache while the entry is still fresh.
    entry = _cache.get(symbol)
    if entry and time.time() - entry["timestamp"] < CACHE_TTL:
        return entry["data"]

    key = _load_api_key()
    if not key:
        return None

    request_headers = {
        "X-Internal-Key": key,
        "Content-Type": "application/json",
    }
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(
                API_URL,
                headers=request_headers,
                json={"token": symbol},
                timeout=aiohttp.ClientTimeout(total=10),
            ) as resp:
                if resp.status >= 400:
                    logger.warning("Market data API %s → %d", symbol, resp.status)
                    return None
                data = await resp.json()
    except Exception as e:
        logger.warning("Market data API error for %s: %s", symbol, e)
        return None

    # Remember the fresh result for subsequent calls.
    _cache[symbol] = {"data": data, "timestamp": time.time()}
    return data
|
||||
|
||||
|
||||
def format_price_context(data: dict, token_name: str) -> str:
    """Format market data into a concise string for the LLM prompt.

    Prefers the API's pre-formatted "result" field; otherwise assembles
    a "key: value | ..." summary from structured fields. Returns "" when
    there is nothing useful to show.
    """
    if not data:
        return ""

    # Preferred path: the API already returns a pre-formatted "result" blob.
    preformatted = data.get("result", "")
    if preformatted:
        return preformatted

    # Fallback: build a summary from structured fields, tolerating both
    # snake_case and camelCase key variants.
    parts = [f"Live market data for {token_name}:"]

    price = data.get("price") or data.get("current_price")
    if price:
        parts.append(f"Price: ${price}")

    mcap = data.get("market_cap") or data.get("marketCap")
    if mcap:
        is_large_numeric = isinstance(mcap, (int, float)) and mcap > 1_000_000
        parts.append(
            f"Market cap: ${mcap/1_000_000:.1f}M" if is_large_numeric
            else f"Market cap: {mcap}"
        )

    volume = data.get("volume") or data.get("volume_24h")
    if volume:
        parts.append(f"24h volume: ${volume}")

    change = data.get("price_change_24h") or data.get("change_24h")
    if change:
        parts.append(f"24h change: {change}")

    # Only the header present → nothing substantive to report.
    return " | ".join(parts) if len(parts) > 1 else ""
|
||||
147
ops/pipeline-v2/telegram/output_gate.py
Normal file
147
ops/pipeline-v2/telegram/output_gate.py
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
"""Output gate — classifies content as system/internal vs public-facing.
|
||||
|
||||
Blocks pipeline messages (extraction logs, merge notifications, diagnostics)
|
||||
from ever reaching the tweet queue or any public-facing output.
|
||||
|
||||
This is a deterministic classifier — no LLM calls. Pattern matching on content.
|
||||
|
||||
Epimetheus owns this module.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# ─── System Message Patterns ─────────────────────────────────────────
# Content matching ANY of these is classified as system/internal.
# NOTE: ".?" inside a pattern tolerates one separator character
# (space, hyphen, underscore, dot) between the joined words.

_SYSTEM_PATTERNS = [
    # Pipeline operations
    re.compile(r"\b(PR\s*#\d+|pull request|merge|rebase|cherry.?pick)\b", re.IGNORECASE),
    re.compile(r"\b(extraction|extracted|extractor|extract/)\b", re.IGNORECASE),
    re.compile(r"\b(pipeline|cron|batch.?extract|systemd|teleo-pipeline)\b", re.IGNORECASE),
    re.compile(r"\b(conflict.?permanent|conflict.?closed|merge.?conflict)\b", re.IGNORECASE),

    # Infrastructure / ops
    re.compile(r"\b(schema\s*v\d+|migration\s*v\d+|SCHEMA_VERSION)\b", re.IGNORECASE),
    re.compile(r"\b(deploy|VPS|ssh|scp|systemctl|journalctl)\b", re.IGNORECASE),
    re.compile(r"\b(Qdrant|embed.?on.?merge|vector.?gc|backfill)\b", re.IGNORECASE),
    re.compile(r"\b(ReadWritePaths|ProtectSystem|ExecStartPre)\b", re.IGNORECASE),

    # Diagnostics
    re.compile(r"\b(vital.?signs|queue.?staleness|orphan.?ratio)\b", re.IGNORECASE),
    re.compile(r"\b(approval.?rate|throughput|PRs?.?per.?hour)\b", re.IGNORECASE),
    re.compile(r"\b(reviewer_count|reviewer.?backfill)\b", re.IGNORECASE),

    # Agent coordination internals
    re.compile(r"\b(Ganymede|Rhea|Oberon)\s+(review(?:ed)?|approv(?:ed|es?)|reject(?:ed|s)?)\b", re.IGNORECASE),
    # Deliberately case-sensitive: these identifiers only appear verbatim in code.
    re.compile(r"\b(PIPELINE_OWNED_PREFIXES|AGENT_NAMES)\b"),
    re.compile(r"\b(worktree|bare.?repo|forgejo|git\.livingip)\b", re.IGNORECASE),

    # Code / technical
    # Case-sensitive on purpose: matches Python keywords, not prose "class"/"import".
    re.compile(r"\b(def\s+\w+|import\s+\w+|class\s+\w+)\b"),
    re.compile(r"\b(\.py|\.yaml|\.json|\.md)\s", re.IGNORECASE),
    re.compile(r"\b(sqlite3?|pipeline\.db|response_audit)\b", re.IGNORECASE),

    # Internal metrics / debugging
    re.compile(r"\b(cosine.?sim|threshold|PRIOR_ART_THRESHOLD)\b", re.IGNORECASE),
    re.compile(r"\b(pre.?screen|Layer\s*[01234]|RRF|entity.?boost)\b", re.IGNORECASE),

    # Paths
    re.compile(r"/opt/teleo-eval/"),
    re.compile(r"/Users/\w+/"),
    re.compile(r"\.pentagon/"),
]

# ─── Public Content Signals ──────────────────────────────────────────
# Content matching these is MORE LIKELY to be public-facing.
# These don't override system classification — they're tiebreakers.

_PUBLIC_SIGNALS = [
    re.compile(r"^(thread|tweet|post):", re.IGNORECASE | re.MULTILINE),
    re.compile(r"\b(insight|analysis|take|perspective|argument)\b", re.IGNORECASE),
    re.compile(r"\b(audience|followers|engagement|impression)\b", re.IGNORECASE),
]
|
||||
|
||||
|
||||
class GateResult:
    """Outcome of an output-gate classification.

    Truthy when the content is safe for public output; falsy when blocked.
    """

    __slots__ = ("is_public", "blocked_reasons", "confidence")

    def __init__(self, is_public: bool, blocked_reasons: list[str], confidence: float):
        self.is_public = is_public              # True → safe for public output
        self.blocked_reasons = blocked_reasons  # matched patterns / block reasons
        self.confidence = confidence            # classifier confidence

    def __bool__(self):
        # Lets callers write `if gate_result:` to mean "is public".
        return self.is_public

    def __repr__(self):
        label = "PUBLIC" if self.is_public else "BLOCKED"
        return f"GateResult({label}, reasons={self.blocked_reasons}, conf={self.confidence:.2f})"
|
||||
|
||||
|
||||
def classify(content: str) -> GateResult:
    """Classify content as public-facing or system/internal.

    Returns GateResult:
      - is_public=True: safe for tweet queue / public output
      - is_public=False: system content, blocked from public outputs
    """
    if not content or not content.strip():
        return GateResult(False, ["empty content"], 1.0)

    # First match per system pattern; the matched text doubles as the reason.
    system_hits = [m.group() for p in _SYSTEM_PATTERNS if (m := p.search(content))]
    public_hits = sum(1 for p in _PUBLIC_SIGNALS if p.search(content))
    n_system = len(system_hits)

    if n_system >= 3:
        # Strong system signal — definitely internal.
        return GateResult(False, system_hits[:5], 0.95)

    if n_system >= 1 and public_hits == 0:
        # Some system signal, no public signal — likely internal.
        return GateResult(False, system_hits, 0.75)

    if n_system == 0:
        # No system signal — public.
        return GateResult(True, [], 0.90 if public_hits > 0 else 0.70)

    # Mixed signals (system hits + public signals) — default to blocking:
    # better to lose a borderline tweet than to leak system internals.
    return GateResult(False, system_hits, 0.50)
|
||||
|
||||
|
||||
def gate_for_tweet_queue(content: str, agent: str | None = None) -> GateResult:
    """Gate specifically for the tweet queue. Stricter than general classify.

    On top of classify(), blocks:
      - content under 20 chars (likely a fragment or command)
      - content containing URLs to internal hosts

    *agent* is accepted for interface compatibility with callers that pass
    attribution; it is not currently consulted.
    """
    result = classify(content)
    if not result.is_public:
        return result

    # Additional tweet-specific checks
    blocked = []

    # Must not be too short (probably a fragment or command)
    stripped = content.strip()
    if len(stripped) < 20:
        blocked.append("content too short for tweet (<20 chars)")

    # Must not contain raw URLs to internal systems
    if re.search(r"https?://(?:localhost|127\.0\.0\.1|77\.42\.65\.182)", stripped):
        blocked.append("contains internal URL")

    if blocked:
        return GateResult(False, blocked, 0.85)

    return result
|
||||
154
ops/pipeline-v2/telegram/response.py
Normal file
154
ops/pipeline-v2/telegram/response.py
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Response construction and post-processing.
|
||||
|
||||
Builds LLM prompts, parses response tags (LEARNING, RESEARCH, SOURCE, CLAIM,
|
||||
CONFIDENCE), strips internal tags from display output.
|
||||
|
||||
All functions are stateless. No Telegram types, no SQLite, no module-level state.
|
||||
|
||||
Extracted from bot.py (Ganymede decomposition spec).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
logger = logging.getLogger("tg.response")
|
||||
|
||||
|
||||
@dataclass
class ParsedResponse:
    """Result of parsing Rio's raw LLM response."""
    # Response text with all internal tags stripped — safe to show users.
    display_text: str
    # Self-reported KB usefulness from the CONFIDENCE tag, clamped to
    # [0.0, 1.0] by parse_response(); None when the tag was missing/unparseable.
    confidence: float | None
    learnings: list[tuple[str, str]] = field(default_factory=list)  # [(category, correction)]
    # Payloads of RESEARCH: / SOURCE: / CLAIM: tags, stripped of whitespace.
    research_queries: list[str] = field(default_factory=list)
    sources: list[str] = field(default_factory=list)
    claims: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
def build_system_prompt(
    *,
    kb_context: str,
    market_context: str,
    research_context: str,
    x_link_context: str,
    learnings: str,
    conversation_history: str,
    username: str,
    message: str,
) -> str:
    """Assemble the full Opus system prompt for Rio's response.

    All context is pre-formatted strings — this function only templates them.

    Keyword-only parameters:
        kb_context: merged KB retrieval output ("What you know" section).
        market_context: live price block; its section is omitted entirely
            when this is empty.
        research_context: pre-formatted research section (may be empty).
        x_link_context: pre-formatted X-link section (may be empty).
        learnings: corrections from past conversations; the prompt tells
            the model to prefer these over KB data on conflict.
        conversation_history: prior exchanges rendered as text.
        username: handle of the message author (rendered after "@").
        message: the user message being answered.

    Returns the complete system prompt, including the tag protocol
    (LEARNING / RESEARCH / SOURCE / CLAIM / CONFIDENCE) that
    parse_response() consumes on the way back.
    """
    # NOTE: everything below is a single f-string literal — any edit here
    # changes the exact prompt the model sees.
    return f"""You are Rio, the Teleo internet finance agent. Your Telegram handle is @FutAIrdBot — that IS you. Users tag @FutAIrdBot to reach you. Never say "I'm not FutAIrdBot." You are also @futaRdIO on X. You have deep knowledge about futarchy, prediction markets, token governance, and the MetaDAO ecosystem.

## RESPONSE LENGTH — CRITICAL
Default to SHORT responses. 1-3 sentences for simple questions. Match the length of the question.
Only go longer when the user explicitly asks for depth, analysis, or a breakdown.
If you catch yourself writing more than one paragraph, stop and ask: "Did they ask for this much?" If not, cut it.

## How to sound
Write like a sharp analyst talking to peers, not like an AI. Specifically:
- Use your knowledge naturally. Don't say "the KB tracks" or "at experimental confidence" or "our claims show." Just state what you know and how confident you are in plain language.
- Have a take. You're an analyst, not a summarizer. Say what you actually think.
- Every sentence must add something the user doesn't already know. Cut filler, restatements, and padding ruthlessly.
- Short questions deserve short answers. Give the fact, not a framing essay.
- Match the user's energy. One-line question = one-line answer.
- Sound human. No em dashes, no "That said", no "It's worth noting." Just say the thing.
- No markdown. Plain text only.
- When you're uncertain, just say so simply. "Not sure about X" — done.

## Your learnings (corrections from past conversations — prioritize these over KB data when they conflict)
{learnings}

## What you know about this topic
{kb_context}

{f"## Live Market Data{chr(10)}{market_context}" if market_context else ""}

{research_context}

{x_link_context}

## Conversation History (NEVER ask a question your history already answers)
{conversation_history}

## The message you're responding to
From: @{username}
Message: {message}

Respond now. Be substantive but concise. If they're wrong about something, say so directly. If they know something you don't, tell them it's worth digging into. If they correct you, accept it and build on the correction. Do NOT respond to messages that aren't directed at you — only respond when tagged or replied to.

IMPORTANT: Special tags you can append at the end of your response (after your main text):

1. LEARNING: [category] [what you learned]
   Categories: factual, communication, structured_data
   Only when genuinely learned something. Most responses have none.
   NEVER save a learning about what data you do or don't have access to.

2. RESEARCH: [search query]
   Triggers a live X search and sends results back to the chat. ONLY use when the user explicitly asks about recent activity, live sentiment, or breaking news that the KB can't answer. Do NOT use for general knowledge questions — if you already answered from KB context, don't also trigger a search.

3. SOURCE: [description of what to ingest]
   When a user shares valuable source material (X posts, articles, data). Creates a source file in the ingestion pipeline, attributed to the user. Include the verbatim content — don't alter or summarize the user's contribution. Use this when someone drops a link or shares original analysis worth preserving.

4. CLAIM: [specific, disagreeable assertion]
   When a user makes a specific claim with evidence that could enter the KB. Creates a draft claim file attributed to them. Only for genuine claims — not opinions or questions.

5. CONFIDENCE: [0.0-1.0]
   ALWAYS include this tag. Rate how well the KB context above actually helped you answer this question. 1.0 = KB had exactly what was needed. 0.5 = KB had partial/tangential info. 0.0 = KB had nothing relevant, you answered from general knowledge. This is for internal audit only — never visible to users."""
|
||||
|
||||
|
||||
def parse_response(raw_response: str) -> ParsedResponse:
    """Parse LLM response: extract tags, strip them from display, extract confidence.

    Tag parsing order: LEARNING, RESEARCH, SOURCE, CLAIM, CONFIDENCE.
    Confidence regex is case-insensitive, bracket-optional.

    Tags are always extracted from ``raw_response`` (not the progressively
    stripped ``display``), so an earlier strip pass can never hide a later
    tag from extraction.
    """
    display = raw_response

    # LEARNING tags
    learnings = re.findall(
        r'^LEARNING:\s*(factual|communication|structured_data)\s+(.+)$',
        raw_response, re.MULTILINE)
    if learnings:
        # Strip regex is deliberately looser than the find regex (\S+ vs the
        # category alternation) so even malformed categories disappear from
        # the user-visible text.
        display = re.sub(r'\n?LEARNING:\s*\S+\s+.+$', '', display, flags=re.MULTILINE).rstrip()

    # RESEARCH tags
    research_queries = re.findall(r'^RESEARCH:\s+(.+)$', raw_response, re.MULTILINE)
    if research_queries:
        display = re.sub(r'\n?RESEARCH:\s+.+$', '', display, flags=re.MULTILINE).rstrip()

    # SOURCE tags
    sources = re.findall(r'^SOURCE:\s+(.+)$', raw_response, re.MULTILINE)
    if sources:
        display = re.sub(r'\n?SOURCE:\s+.+$', '', display, flags=re.MULTILINE).rstrip()

    # CLAIM tags
    claims = re.findall(r'^CLAIM:\s+(.+)$', raw_response, re.MULTILINE)
    if claims:
        display = re.sub(r'\n?CLAIM:\s+.+$', '', display, flags=re.MULTILINE).rstrip()

    # CONFIDENCE tag (case-insensitive, bracket-optional)
    confidence = None
    confidence_match = re.search(
        r'^CONFIDENCE:\s*\[?([\d.]+)\]?', raw_response, re.MULTILINE | re.IGNORECASE)
    if confidence_match:
        try:
            # Clamp to [0.0, 1.0]; ValueError covers junk like "..".
            confidence = max(0.0, min(1.0, float(confidence_match.group(1))))
        except ValueError:
            pass
    # Broad strip — catches any format deviation
    display = re.sub(
        r'\n?^CONFIDENCE\s*:.*$', '', display, flags=re.MULTILINE | re.IGNORECASE).rstrip()

    return ParsedResponse(
        display_text=display,
        confidence=confidence,
        learnings=[(cat, corr) for cat, corr in learnings],
        research_queries=[q.strip() for q in research_queries],
        sources=[s.strip() for s in sources],
        claims=[c.strip() for c in claims],
    )
|
||||
347
ops/pipeline-v2/telegram/retrieval.py
Normal file
347
ops/pipeline-v2/telegram/retrieval.py
Normal file
|
|
@ -0,0 +1,347 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Retrieval orchestration — keyword, vector, RRF merge, query decomposition.
|
||||
|
||||
All functions are stateless. LLM calls are injected via callback (llm_fn).
|
||||
No Telegram types, no SQLite, no module-level state.
|
||||
|
||||
Extracted from bot.py (Ganymede decomposition spec).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Callable, Awaitable
|
||||
|
||||
from lib.config import (
|
||||
RETRIEVAL_RRF_K as RRF_K,
|
||||
RETRIEVAL_ENTITY_BOOST as ENTITY_BOOST,
|
||||
RETRIEVAL_MAX_RESULTS as MAX_RETRIEVAL_CLAIMS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("tg.retrieval")
|
||||
|
||||
# Type alias for the LLM callback injected by bot.py
|
||||
LLMFn = Callable[[str, str, int], Awaitable[str | None]] # (model, prompt, max_tokens) → response
|
||||
|
||||
|
||||
def rrf_merge_context(kb_ctx: Any, vector_meta: dict, kb_read_dir: str) -> tuple[str, list[dict]]:
    """Merge keyword and vector retrieval into a single ranked claim list via RRF.

    Reciprocal Rank Fusion: RRF(d) = Σ 1/(k + rank_i(d))
    k=20 tuned for small result sets (5-10 per source).

    Entity-aware boosting: claims wiki-linked from matched entities get +50% RRF score.

    Args:
        kb_ctx: keyword-retrieval context object (entities / claims /
            positions / belief_excerpts / stats), or None/falsy when
            keyword retrieval produced nothing.
        vector_meta: merged vector-search metadata (see vector_search_merge).
        kb_read_dir: KB root prefix stripped from claim paths so keyword
            and vector results key on the same relative path.

    Returns (formatted_text, ranked_claims_for_audit).
    """
    # Collect claim titles wiki-linked from matched entities
    entity_linked_titles: set[str] = set()
    if kb_ctx and kb_ctx.entities:
        for ent in kb_ctx.entities:
            for t in ent.related_claims:
                entity_linked_titles.add(t.lower())

    # --- Build per-claim RRF scores ---
    claim_map: dict[str, dict] = {}

    # Keyword claims (already sorted by keyword score desc).
    # Guard against kb_ctx=None — the entity loop above already allows it,
    # but the original crashed here with AttributeError.
    keyword_claims = kb_ctx.claims if kb_ctx else []
    for rank, claim in enumerate(keyword_claims):
        p = claim.path
        if kb_read_dir and p.startswith(kb_read_dir):
            p = p[len(kb_read_dir):].lstrip("/")
        rrf = 1.0 / (RRF_K + rank)
        claim_map[p] = {
            "rrf_score": rrf,
            "title": claim.title,
            "domain": claim.domain,
            "confidence": claim.confidence,
            "description": claim.description,
            "source": "keyword",
            "vector_score": None,
        }

    # Vector results (already sorted by cosine desc)
    for rank, vr in enumerate(vector_meta.get("direct_results", [])):
        p = vr.get("path", "")
        rrf = 1.0 / (RRF_K + rank)
        if p in claim_map:
            # Seen by both retrievers: RRF contributions add up.
            claim_map[p]["rrf_score"] += rrf
            claim_map[p]["source"] = "vector+keyword"
            claim_map[p]["vector_score"] = vr.get("score")
        else:
            claim_map[p] = {
                "rrf_score": rrf,
                "title": vr.get("title", ""),
                "domain": vr.get("domain", ""),
                "confidence": "",
                "description": "",
                "source": "vector",
                "vector_score": vr.get("score"),
            }

    # Apply entity-linked boost
    if entity_linked_titles:
        for p, info in claim_map.items():
            if info["title"].lower() in entity_linked_titles:
                info["rrf_score"] *= ENTITY_BOOST
                info["source"] = info["source"] + "+entity"

    # Sort by RRF score desc
    ranked = sorted(claim_map.items(), key=lambda x: x[1]["rrf_score"], reverse=True)

    # --- Format output ---
    sections = []

    # Entities section (keyword search is still best for entity resolution)
    if kb_ctx and kb_ctx.entities:
        sections.append("## Matched Entities")
        for i, ent in enumerate(kb_ctx.entities):
            sections.append(f"**{ent.name}** ({ent.entity_type}, {ent.domain})")
            # Full overview for the top 3 entities, a teaser for the rest.
            if i < 3:
                sections.append(ent.overview[:8000])
            else:
                sections.append(ent.overview[:500])
            if ent.related_claims:
                sections.append("Related claims: " + ", ".join(ent.related_claims[:5]))
            sections.append("")

    # Merged claims section (RRF-ranked)
    if ranked:
        sections.append("## Retrieved Claims")
        for path, info in ranked[:MAX_RETRIEVAL_CLAIMS]:
            line = f"- **{info['title']}**"
            meta_parts = []
            if info["confidence"]:
                meta_parts.append(f"confidence: {info['confidence']}")
            if info["domain"]:
                meta_parts.append(info["domain"])
            if info["vector_score"] is not None:
                meta_parts.append(f"{int(info['vector_score'] * 100)}% semantic match")
            if meta_parts:
                line += f" ({', '.join(meta_parts)})"
            sections.append(line)
            if info["description"]:
                sections.append(f"  {info['description']}")
        sections.append("")

    # Positions section
    if kb_ctx and kb_ctx.positions:
        sections.append("## Agent Positions")
        for pos in kb_ctx.positions:
            sections.append(f"**{pos.agent}**: {pos.title}")
            sections.append(pos.content[:200])
            sections.append("")

    # Beliefs section
    if kb_ctx and kb_ctx.belief_excerpts:
        sections.append("## Relevant Beliefs")
        for exc in kb_ctx.belief_excerpts:
            sections.append(exc)
        sections.append("")

    # Build audit-friendly ranked list
    claims_audit = []
    for i, (path, info) in enumerate(ranked[:MAX_RETRIEVAL_CLAIMS]):
        claims_audit.append({
            "path": path, "title": info["title"],
            "score": round(info["rrf_score"], 4),
            "rank": i + 1, "source": info["source"],
        })

    if not sections:
        return "No relevant KB content found for this query.", claims_audit

    # Stats footer. Counts use the base source label: the "+entity" suffix
    # added by boosting previously made boosted claims vanish from these
    # tallies.
    def _base(src: str) -> str:
        return src.removesuffix("+entity")

    stats = kb_ctx.stats if kb_ctx else {}
    n_vector = sum(1 for _, v in ranked if _base(v["source"]) in ("vector", "vector+keyword"))
    n_keyword = sum(1 for _, v in ranked if _base(v["source"]) in ("keyword", "vector+keyword"))
    n_both = sum(1 for _, v in ranked if _base(v["source"]) == "vector+keyword")
    sections.append(f"---\nKB: {stats.get('total_claims', '?')} claims, "
                    f"{stats.get('total_entities', '?')} entities. "
                    f"Retrieved: {len(ranked)} claims (vector: {n_vector}, keyword: {n_keyword}, both: {n_both}).")

    return "\n".join(sections), claims_audit
|
||||
|
||||
|
||||
async def reformulate_query(
|
||||
query: str,
|
||||
history: list[dict],
|
||||
llm_fn: LLMFn,
|
||||
model: str,
|
||||
) -> str:
|
||||
"""Rewrite conversational follow-ups into standalone search queries.
|
||||
|
||||
If there's no conversation history or the query is already standalone,
|
||||
returns the original query unchanged.
|
||||
"""
|
||||
if not history:
|
||||
return query
|
||||
|
||||
try:
|
||||
last_exchange = history[-1]
|
||||
recent_context = ""
|
||||
if last_exchange.get("user"):
|
||||
recent_context += f"User: {last_exchange['user'][:300]}\n"
|
||||
if last_exchange.get("bot"):
|
||||
recent_context += f"Bot: {last_exchange['bot'][:300]}\n"
|
||||
reformulate_prompt = (
|
||||
f"A user is in a conversation. Given the recent exchange and their new message, "
|
||||
f"rewrite the new message as a STANDALONE search query that captures what they're "
|
||||
f"actually asking about. The query should work for semantic search — specific topics, "
|
||||
f"entities, and concepts.\n\n"
|
||||
f"Recent exchange:\n{recent_context}\n"
|
||||
f"New message: {query}\n\n"
|
||||
f"If the message is already a clear standalone question or topic, return it unchanged.\n"
|
||||
f"If it's a follow-up, correction, or reference to the conversation, rewrite it.\n\n"
|
||||
f"Return ONLY the rewritten query, nothing else. Max 30 words."
|
||||
)
|
||||
reformulated = await llm_fn(model, reformulate_prompt, 80)
|
||||
if reformulated and reformulated.strip() and len(reformulated.strip()) > 3:
|
||||
logger.info("Query reformulated: '%s' → '%s'", query[:60], reformulated.strip()[:60])
|
||||
return reformulated.strip()
|
||||
except Exception as e:
|
||||
logger.warning("Query reformulation failed: %s", e)
|
||||
|
||||
return query
|
||||
|
||||
|
||||
async def decompose_query(
|
||||
query: str,
|
||||
llm_fn: LLMFn,
|
||||
model: str,
|
||||
) -> list[str]:
|
||||
"""Split multi-part queries into focused sub-queries for vector search.
|
||||
|
||||
Only decomposes if query is >8 words and contains a conjunction or multiple
|
||||
question marks. Otherwise returns [query] unchanged.
|
||||
"""
|
||||
try:
|
||||
words = query.split()
|
||||
has_conjunction = any(w.lower() in ("and", "but", "also", "plus", "versus", "vs") for w in words)
|
||||
has_question_marks = query.count("?") > 1
|
||||
if len(words) > 8 and (has_conjunction or has_question_marks):
|
||||
decompose_prompt = (
|
||||
f"Split this query into 2-3 focused search sub-queries. Each sub-query should "
|
||||
f"target one specific concept or question. Return one sub-query per line, nothing else.\n\n"
|
||||
f"Query: {query}\n\n"
|
||||
f"If the query is already focused on one topic, return it unchanged on a single line."
|
||||
)
|
||||
decomposed = await llm_fn(model, decompose_prompt, 150)
|
||||
if decomposed:
|
||||
parts = [p.strip().lstrip("0123456789.-) ") for p in decomposed.strip().split("\n") if p.strip()]
|
||||
if 1 < len(parts) <= 4:
|
||||
logger.info("Query decomposed: '%s' → %s", query[:60], parts)
|
||||
return parts
|
||||
except Exception as e:
|
||||
logger.warning("Query decomposition failed: %s", e)
|
||||
|
||||
return [query]
|
||||
|
||||
|
||||
def vector_search_merge(
    sub_queries: list[str],
    retrieve_vector_fn: Callable[[str], tuple[str, dict]],
) -> dict:
    """Run vector search on each sub-query, dedup by path (keep highest score).

    Returns merged vector_meta dict with keys:
    direct_results, expanded_results, layers_hit, duration_ms, errors
    (the "errors" key is present only when at least one search errored).
    """
    direct: list = []
    expanded: list = []
    layers_hit: list = []
    duration_ms = 0
    errors: list = []

    for sub_query in sub_queries:
        _, meta = retrieve_vector_fn(sub_query)
        direct += meta.get("direct_results", [])
        expanded += meta.get("expanded_results", [])
        layers_hit += meta.get("layers_hit", [])
        duration_ms += meta.get("duration_ms", 0)
        err = meta.get("error")
        if err:
            errors.append(err)

    # Keep one result per path — the best-scoring one wins; ties keep
    # the earliest occurrence.
    best_by_path: dict[str, dict] = {}
    for res in direct:
        path = res.get("path", "")
        current = best_by_path.get(path)
        if current is None or res.get("score", 0) > current.get("score", 0):
            best_by_path[path] = res

    merged = {
        "direct_results": list(best_by_path.values()),
        "expanded_results": expanded,
        "layers_hit": list(set(layers_hit)),
        "duration_ms": duration_ms,
    }
    if errors:
        merged["errors"] = errors
    return merged
|
||||
|
||||
|
||||
async def orchestrate_retrieval(
    text: str,
    search_query: str,
    kb_read_dir: str,
    kb_index: Any,
    llm_fn: LLMFn,
    triage_model: str,
    retrieve_context_fn: Callable,
    retrieve_vector_fn: Callable[[str], tuple[str, dict]],
    kb_scope: list[str] | None = None,
) -> dict:
    """Full retrieval pipeline: keyword → decompose → vector → RRF merge.

    Returns dict with keys:
    kb_context_text, claims_audit, retrieval_layers, vector_meta,
    tool_calls, kb_ctx.
    """
    calls: list[dict] = []

    # 1. Keyword retrieval (entity resolution needs full context)
    started = time.monotonic()
    kb_ctx = retrieve_context_fn(search_query, kb_read_dir, index=kb_index, kb_scope=kb_scope)
    keyword_ms = int((time.monotonic() - started) * 1000)
    had_keyword_hits = bool(kb_ctx and (kb_ctx.entities or kb_ctx.claims))
    layers = ["keyword"] if had_keyword_hits else []
    calls.append({
        "tool": "retrieve_context",
        "input": {
            "query": search_query[:200],
            "original_query": text[:200] if search_query != text else None,
        },
        "output": {
            "entities": len(kb_ctx.entities) if kb_ctx else 0,
            "claims": len(kb_ctx.claims) if kb_ctx else 0,
        },
        "duration_ms": keyword_ms,
    })

    # 2. Query decomposition (only logged as a tool call when it split)
    started = time.monotonic()
    sub_queries = await decompose_query(search_query, llm_fn, triage_model)
    decompose_ms = int((time.monotonic() - started) * 1000)
    if len(sub_queries) > 1:
        calls.append({
            "tool": "query_decompose",
            "input": {"query": search_query[:200]},
            "output": {"sub_queries": sub_queries},
            "duration_ms": decompose_ms,
        })

    # 3. Vector search across sub-queries
    vector_meta = vector_search_merge(sub_queries, retrieve_vector_fn)

    # 4. RRF merge of keyword and vector results
    kb_context_text, claims_audit = rrf_merge_context(kb_ctx, vector_meta, kb_read_dir)
    layers.extend(vector_meta.get("layers_hit", []))
    calls.append({
        "tool": "retrieve_qdrant_context",
        "input": {"query": text[:200]},
        "output": {
            "direct_hits": len(vector_meta.get("direct_results", [])),
            "expanded": len(vector_meta.get("expanded_results", [])),
        },
        "duration_ms": vector_meta.get("duration_ms", 0),
    })

    return {
        "kb_context_text": kb_context_text,
        "claims_audit": claims_audit,
        "retrieval_layers": layers,
        "vector_meta": vector_meta,
        "tool_calls": calls,
        "kb_ctx": kb_ctx,
    }
|
||||
62
ops/pipeline-v2/telegram/rio.yaml
Normal file
62
ops/pipeline-v2/telegram/rio.yaml
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
# Rio — Teleo internet finance agent
|
||||
# This config drives Rio's Telegram bot identity, KB scope, and voice.
|
||||
|
||||
# ─── Identity ────────────────────────────────────────────────────────────
|
||||
name: Rio
|
||||
handle: "@FutAIrdBot"
|
||||
x_handle: "@futaRdIO"
|
||||
bot_token_file: telegram-bot-token
|
||||
pentagon_agent_id: 244ba05f
|
||||
domain: internet-finance
|
||||
domain_expertise: >
|
||||
futarchy, prediction markets, token governance, the MetaDAO ecosystem,
|
||||
conditional markets, internet capital formation, and permissionless fundraising
|
||||
|
||||
# ─── KB Scope ────────────────────────────────────────────────────────────
|
||||
# One full-KB query; results tagged primary/cross-domain post-hoc.
|
||||
kb_scope:
|
||||
primary:
|
||||
- domains/internet-finance
|
||||
- foundations
|
||||
- core
|
||||
|
||||
# ─── Voice ───────────────────────────────────────────────────────────────
|
||||
voice_summary: "Sharp analyst talking to peers. High signal density."
|
||||
|
||||
voice_definition: |
|
||||
## Register
|
||||
You're a sharp analyst talking to peers — people who know markets and
|
||||
governance mechanisms. Don't explain basics unless asked. Lead with your
|
||||
take, not the context.
|
||||
|
||||
## Certainty Expression
|
||||
Be direct about conviction levels. "High conviction" / "Speculative but
|
||||
interesting" / "I don't know." Never hedge with weasel words when you
|
||||
have a clear view. Never express false certainty when you don't.
|
||||
|
||||
## Domain Vocabulary
|
||||
Use futarchy, pro-rata, oversubscription, ICO, conditional markets,
|
||||
liquidation proposals without explanation. Explain newer protocol-specific
|
||||
terms (ownership coins, PRISM) on first use.
|
||||
|
||||
## Signature Moves
|
||||
Connect everything to market mechanisms and incentive structures. When
|
||||
someone describes a governance problem, you see the market design solution.
|
||||
When someone describes a market outcome, you trace it back to the
|
||||
mechanism that produced it.
|
||||
|
||||
# ─── Learnings ───────────────────────────────────────────────────────────
|
||||
learnings_file: agents/rio/learnings.md
|
||||
|
||||
# ─── Eval ────────────────────────────────────────────────────────────────
|
||||
opsec_additional_patterns:
|
||||
- "token price \\$[\\d,.]+"
|
||||
- "LP (allocation|commitment)"
|
||||
|
||||
# ─── Model ───────────────────────────────────────────────────────────────
|
||||
response_model: anthropic/claude-opus-4-6
|
||||
triage_model: anthropic/claude-haiku-4.5
|
||||
max_tokens: 500
|
||||
|
||||
# ─── Rate Limits ─────────────────────────────────────────────────────────
|
||||
max_response_per_user_per_hour: 30
|
||||
68
ops/pipeline-v2/telegram/theseus.yaml
Normal file
68
ops/pipeline-v2/telegram/theseus.yaml
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
# Theseus — Teleo AI alignment & collective superintelligence agent
|
||||
# This config drives Theseus's Telegram bot identity, KB scope, and voice.
|
||||
|
||||
# ─── Identity ────────────────────────────────────────────────────────────
|
||||
name: Theseus
|
||||
handle: "@TheseusAIBot"
|
||||
bot_token_file: theseus-bot-token
|
||||
pentagon_agent_id: 46864dd4
|
||||
domain: ai-alignment
|
||||
domain_expertise: >
|
||||
AI alignment, collective superintelligence architecture, multi-agent
|
||||
coordination, capability evaluation, formal verification as oversight,
|
||||
and empirical AI capability evidence
|
||||
|
||||
# ─── KB Scope ────────────────────────────────────────────────────────────
|
||||
kb_scope:
|
||||
primary:
|
||||
- domains/ai-alignment
|
||||
- domains/collective-intelligence
|
||||
- foundations
|
||||
- core
|
||||
|
||||
# ─── Voice ───────────────────────────────────────────────────────────────
|
||||
voice_summary: "Technically precise, structurally analytical. Alignment as coordination, not catastrophe."
|
||||
|
||||
voice_definition: |
|
||||
## Register
|
||||
Technically precise but accessible. No jargon-hiding, no catastrophizing.
|
||||
You see alignment as a coordination problem, not a purely technical one.
|
||||
That framing shapes everything you say.
|
||||
|
||||
## Certainty Expression
|
||||
Intellectually honest about what's empirically grounded vs theoretically
|
||||
thin. Say "the evidence shows" when it does, "this is structural analysis"
|
||||
when it's inference, "I don't know" when you don't. Never dress speculation
|
||||
as fact.
|
||||
|
||||
## Domain Vocabulary
|
||||
Use alignment, mesa-optimization, RLHF, constitutional AI, verification,
|
||||
coordination protocols, capability evaluation without explanation. Ground
|
||||
abstract alignment concepts in concrete examples — the Claude's Cycles
|
||||
research program, multi-agent architectures, observable failure modes.
|
||||
|
||||
## Signature Moves
|
||||
Connect everything to coordination and architecture. When someone raises
|
||||
an alignment concern, you see the structural mechanism. When someone
|
||||
describes a capability, you trace the coordination pattern that produced
|
||||
it. Evidence over theory — always prefer documented observation over
|
||||
hypotheticals.
|
||||
|
||||
## What You Don't Do
|
||||
No doomerism, no accelerationism. Structural analysis only. Don't
|
||||
catastrophize and don't hand-wave risks away.
|
||||
|
||||
# ─── Learnings ───────────────────────────────────────────────────────────
|
||||
learnings_file: agents/theseus/learnings.md
|
||||
|
||||
# ─── Eval ────────────────────────────────────────────────────────────────
|
||||
opsec_additional_patterns:
|
||||
- "internal (architecture|infra)"
|
||||
|
||||
# ─── Model ───────────────────────────────────────────────────────────────
|
||||
response_model: anthropic/claude-opus-4-6
|
||||
triage_model: anthropic/claude-haiku-4.5
|
||||
max_tokens: 500
|
||||
|
||||
# ─── Rate Limits ─────────────────────────────────────────────────────────
|
||||
max_response_per_user_per_hour: 30
|
||||
85
ops/pipeline-v2/telegram/worktree_lock.py
Normal file
85
ops/pipeline-v2/telegram/worktree_lock.py
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
"""File-based lock for ALL processes writing to the main worktree.
|
||||
|
||||
One lock, one mechanism (Ganymede: Option C). Used by:
|
||||
- Pipeline daemon stages (entity_batch, source archiver, substantive_fixer) via async wrapper
|
||||
- Telegram bot (sync context manager)
|
||||
|
||||
Protects: /opt/teleo-eval/workspaces/main/
|
||||
|
||||
flock auto-releases on process exit (even crash/kill). No stale lock cleanup needed.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import fcntl
|
||||
import logging
|
||||
import time
|
||||
from contextlib import asynccontextmanager, contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger("worktree-lock")

# Single shared lockfile for every process that writes to the main worktree.
# Both the sync and async context managers below flock() this same path.
LOCKFILE = Path("/opt/teleo-eval/workspaces/.main-worktree.lock")
|
||||
|
||||
|
||||
@contextmanager
def main_worktree_lock(timeout: float = 10.0):
    """Sync context manager — use in telegram bot and other external processes.

    Polls flock() with LOCK_NB every 100ms until acquired or `timeout`
    seconds elapse, then raises TimeoutError. flock auto-releases on
    process exit, so no stale-lock cleanup is needed.

    Usage:
        with main_worktree_lock():
            # write to inbox/queue/, git add/commit/push, etc.
    """
    LOCKFILE.parent.mkdir(parents=True, exist_ok=True)
    handle = open(LOCKFILE, "w")
    deadline = time.monotonic() + timeout
    acquired = False
    while not acquired:
        try:
            fcntl.flock(handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
            acquired = True
        except BlockingIOError:
            if time.monotonic() > deadline:
                handle.close()
                logger.warning("Main worktree lock timeout after %.0fs", timeout)
                raise TimeoutError(f"Could not acquire main worktree lock in {timeout}s")
            time.sleep(0.1)
    try:
        yield
    finally:
        fcntl.flock(handle, fcntl.LOCK_UN)
        handle.close()
|
||||
|
||||
|
||||
@asynccontextmanager
async def async_main_worktree_lock(timeout: float = 10.0):
    """Async context manager — use in pipeline daemon stages.

    Acquires the same file lock as main_worktree_lock() via run_in_executor
    so the event loop is never blocked (Ganymede: <1ms overhead).

    Raises:
        TimeoutError: if the lock cannot be acquired within `timeout` seconds.

    Usage:
        async with async_main_worktree_lock():
            await _git("fetch", "origin", "main", cwd=main_dir)
            await _git("reset", "--hard", "origin/main", cwd=main_dir)
            # ... write files, commit, push ...
    """
    # get_running_loop() is the correct call from inside a coroutine;
    # get_event_loop() is deprecated for this use since Python 3.10.
    loop = asyncio.get_running_loop()
    LOCKFILE.parent.mkdir(parents=True, exist_ok=True)
    fp = open(LOCKFILE, "w")

    def _acquire():
        # Poll with LOCK_NB instead of blocking so the timeout is honored.
        start = time.monotonic()
        while True:
            try:
                fcntl.flock(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
                return
            except BlockingIOError:
                if time.monotonic() - start > timeout:
                    raise TimeoutError(f"Could not acquire main worktree lock in {timeout}s")
                time.sleep(0.1)

    try:
        await loop.run_in_executor(None, _acquire)
    except BaseException:
        # Acquisition failed (timeout) or the awaiting task was cancelled —
        # close the fd here instead of leaking it (the original only closed
        # it on the timeout path, inside the executor thread).
        fp.close()
        raise
    try:
        yield
    finally:
        fcntl.flock(fp, fcntl.LOCK_UN)
        fp.close()
|
||||
366
ops/pipeline-v2/telegram/x_client.py
Normal file
366
ops/pipeline-v2/telegram/x_client.py
Normal file
|
|
@ -0,0 +1,366 @@
|
|||
#!/usr/bin/env python3
|
||||
"""X (Twitter) API client for Teleo agents.
|
||||
|
||||
Consolidated interface to twitterapi.io. Used by:
|
||||
- Telegram bot (research, tweet fetching, link analysis)
|
||||
- Research sessions (network monitoring, source discovery)
|
||||
- Any agent that needs X data
|
||||
|
||||
Epimetheus owns this module.
|
||||
|
||||
## Available Endpoints (twitterapi.io)
|
||||
|
||||
| Endpoint | What it does | When to use |
|
||||
|----------|-------------|-------------|
|
||||
| GET /tweets?tweet_ids={id} | Fetch specific tweet(s) by ID | User drops a link, need full content |
|
||||
| GET /article?tweet_id={id} | Fetch X long-form article | User drops an article link |
|
||||
| GET /tweet/advanced_search?query={q} | Search tweets by keyword | /research command, topic discovery |
|
||||
| GET /user/last_tweets?userName={u} | Get user's recent tweets | Network monitoring, agent research |
|
||||
|
||||
## Cost
|
||||
|
||||
All endpoints use the X-API-Key header. Pricing is per-request via twitterapi.io.
|
||||
Rate limits depend on plan tier. Key at /opt/teleo-eval/secrets/twitterapi-io-key.
|
||||
|
||||
## Rate Limiting
|
||||
|
||||
Research searches: 3 per user per day (explicit /research).
|
||||
Haiku autonomous searches: uncapped (don't burn user budget).
|
||||
Tweet fetches (URL lookups): uncapped (cheap, single tweet).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import aiohttp
|
||||
|
||||
logger = logging.getLogger("x-client")

# ─── Config ──────────────────────────────────────────────────────────────

BASE_URL = "https://api.twitterapi.io/twitter"  # twitterapi.io REST root
API_KEY_FILE = "/opt/teleo-eval/secrets/twitterapi-io-key"  # deployment secret
REQUEST_TIMEOUT = 15  # seconds (total budget per HTTP request)

# Rate limiting for user-triggered research
# Maps telegram user_id -> timestamps (epoch seconds) of explicit /research
# requests. In-memory only, so counters reset on process restart.
_research_usage: dict[int, list[float]] = {}
MAX_RESEARCH_PER_DAY = 3
|
||||
|
||||
|
||||
# ─── API Key ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _load_api_key() -> Optional[str]:
    """Load the twitterapi.io API key from secrets, or None if unreadable."""
    key_path = Path(API_KEY_FILE)
    try:
        return key_path.read_text().strip()
    except Exception:
        logger.warning("X API key not found at %s", API_KEY_FILE)
        return None
|
||||
|
||||
|
||||
def _headers() -> dict:
    """Build request headers with the API key; {} when the key is missing."""
    api_key = _load_api_key()
    return {"X-API-Key": api_key} if api_key else {}
|
||||
|
||||
|
||||
# ─── Rate Limiting ───────────────────────────────────────────────────────
|
||||
|
||||
def check_research_rate_limit(user_id: int) -> bool:
    """Check if user has research requests remaining. Returns True if allowed.

    Also prunes this user's usage history down to the last 24 hours.
    """
    cutoff = time.time() - 86400
    recent = [stamp for stamp in _research_usage.get(user_id, []) if stamp > cutoff]
    _research_usage[user_id] = recent
    return len(recent) < MAX_RESEARCH_PER_DAY
|
||||
|
||||
|
||||
def record_research_usage(user_id: int):
    """Record an explicit research request against user's daily limit."""
    bucket = _research_usage.setdefault(user_id, [])
    bucket.append(time.time())
|
||||
|
||||
|
||||
def get_research_remaining(user_id: int) -> int:
    """Get remaining research requests for today (never negative)."""
    cutoff = time.time() - 86400
    used = sum(1 for stamp in _research_usage.get(user_id, []) if stamp > cutoff)
    return max(0, MAX_RESEARCH_PER_DAY - used)
|
||||
|
||||
|
||||
# ─── Core API Functions ──────────────────────────────────────────────────
|
||||
|
||||
async def get_tweet(tweet_id: str) -> Optional[dict]:
    """Fetch a single tweet by ID. Works for any tweet, any age.

    Endpoint: GET /tweets?tweet_ids={id}

    Returns a normalized tweet dict, or None on any failure
    (missing API key, non-200 status, empty result, network error).
    """
    headers = _headers()
    if not headers:
        return None

    request_timeout = aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"{BASE_URL}/tweets",
                params={"tweet_ids": tweet_id},
                headers=headers,
                timeout=request_timeout,
            ) as resp:
                if resp.status != 200:
                    logger.warning("get_tweet(%s) → %d", tweet_id, resp.status)
                    return None
                payload = await resp.json()
                found = payload.get("tweets", [])
                return _normalize_tweet(found[0]) if found else None
    except Exception as e:
        logger.warning("get_tweet(%s) error: %s", tweet_id, e)
        return None
|
||||
|
||||
|
||||
async def get_article(tweet_id: str) -> Optional[dict]:
    """Fetch an X long-form article by tweet ID.

    Endpoint: GET /article?tweet_id={id}

    The article body arrives as a list of typed content blocks which this
    function flattens into a single markdown-ish string.

    Returns structured dict (text, title, author fields, engagement counts,
    is_article=True) or None if not an article / not found / any error.
    """
    headers = _headers()
    if not headers:
        return None

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"{BASE_URL}/article",
                params={"tweet_id": tweet_id},
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT),
            ) as resp:
                if resp.status != 200:
                    return None
                data = await resp.json()
                article = data.get("article")
                if not article:
                    return None
                # Article body is in "contents" array (not "text" field)
                contents = article.get("contents", [])
                text_parts = []
                for block in contents:
                    block_text = block.get("text", "")
                    if not block_text:
                        # Skip empty blocks (e.g. media-only entries).
                        continue
                    block_type = block.get("type", "unstyled")
                    # Map each block type to a markdown-flavored line.
                    # Order matters: the header check is a prefix match
                    # (covers header-one, header-two, ...).
                    if block_type.startswith("header"):
                        text_parts.append(f"\n## {block_text}\n")
                    elif block_type == "markdown":
                        text_parts.append(block_text)
                    elif block_type in ("unordered-list-item",):
                        text_parts.append(f"- {block_text}")
                    elif block_type in ("ordered-list-item",):
                        # NOTE(review): ordered items are rendered with "*",
                        # not numbers — presumably a deliberate flattening;
                        # confirm before "fixing".
                        text_parts.append(f"* {block_text}")
                    elif block_type == "blockquote":
                        text_parts.append(f"> {block_text}")
                    else:
                        # Unknown/unstyled blocks pass through as plain text.
                        text_parts.append(block_text)
                full_text = "\n".join(text_parts)
                author_data = article.get("author", {})
                # "or 0" guards against explicit nulls in the payload.
                likes = article.get("likeCount", 0) or 0
                retweets = article.get("retweetCount", 0) or 0
                return {
                    "text": full_text,
                    "title": article.get("title", ""),
                    "author": author_data.get("userName", ""),
                    "author_name": author_data.get("name", ""),
                    "author_followers": author_data.get("followers", 0),
                    "tweet_date": article.get("createdAt", ""),
                    "is_article": True,
                    "engagement": likes + retweets,
                    "likes": likes,
                    "retweets": retweets,
                    "views": article.get("viewCount", 0) or 0,
                }
    except Exception as e:
        logger.warning("get_article(%s) error: %s", tweet_id, e)
        return None
|
||||
|
||||
|
||||
async def search_tweets(query: str, max_results: int = 20, min_engagement: int = 0) -> list[dict]:
    """Search X for tweets matching a query. Returns most recent, sorted by engagement.

    Endpoint: GET /tweet/advanced_search?query={q}&queryType=Latest

    Retweets and tweets below `min_engagement` are filtered out.
    Use short queries (2-3 words). Long queries return nothing.
    """
    headers = _headers()
    if not headers:
        return []

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"{BASE_URL}/tweet/advanced_search",
                params={"query": query, "queryType": "Latest"},
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT),
            ) as resp:
                if resp.status >= 400:
                    logger.warning("search_tweets('%s') → %d", query, resp.status)
                    return []
                data = await resp.json()
                raw_tweets = data.get("tweets", [])
    except Exception as e:
        logger.warning("search_tweets('%s') error: %s", query, e)
        return []

    # Scan up to 2x max_results so filtering can still fill the list.
    kept: list[dict] = []
    for raw in raw_tweets[:max_results * 2]:
        tweet = _normalize_tweet(raw)
        if tweet is None:
            continue
        is_retweet = tweet["text"].startswith("RT @")
        if is_retweet or tweet["engagement"] < min_engagement:
            continue
        kept.append(tweet)
        if len(kept) >= max_results:
            break

    kept.sort(key=lambda t: t["engagement"], reverse=True)
    return kept
|
||||
|
||||
|
||||
async def get_user_tweets(username: str, max_results: int = 20) -> list[dict]:
    """Get a user's most recent tweets.

    Endpoint: GET /user/last_tweets?userName={username}

    Used by research sessions for network monitoring.

    Returns up to `max_results` normalized tweet dicts; [] on any failure
    (missing API key, HTTP error, network error).
    """
    headers = _headers()
    if not headers:
        return []

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"{BASE_URL}/user/last_tweets",
                params={"userName": username},
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT),
            ) as resp:
                if resp.status >= 400:
                    logger.warning("get_user_tweets('%s') → %d", username, resp.status)
                    return []
                data = await resp.json()
                raw_tweets = data.get("tweets", [])
    except Exception as e:
        logger.warning("get_user_tweets('%s') error: %s", username, e)
        return []

    # Normalize once per tweet (the original called _normalize_tweet twice
    # per item — once in the filter, once for the value) and drop tweets
    # the normalizer rejects (no text).
    normalized = (_normalize_tweet(t) for t in raw_tweets[:max_results])
    return [t for t in normalized if t]
|
||||
|
||||
|
||||
# ─── High-Level Functions ────────────────────────────────────────────────
|
||||
|
||||
async def fetch_from_url(url: str) -> Optional[dict]:
    """Fetch tweet or article content from an X URL.

    Tries tweet lookup first (most common), then article endpoint.
    Returns structured dict with text, author, engagement.
    Returns placeholder dict (not None) on failure so the caller can tell
    the user "couldn't fetch" instead of silently ignoring.

    Returns None only when `url` is not a recognizable X/Twitter status link.
    """
    # Accepts both twitter.com and x.com status URLs.
    match = re.search(r'(?:twitter\.com|x\.com)/(\w+)/status/(\d+)', url)
    if not match:
        return None

    username = match.group(1)
    tweet_id = match.group(2)

    # Try tweet first (most X URLs are tweets)
    tweet_result = await get_tweet(tweet_id)

    if tweet_result:
        tweet_text = tweet_result.get("text", "").strip()
        # A tweet whose text is just a bare link is usually the stub for an
        # X Article — fall through to the article endpoint in that case.
        is_just_url = tweet_text.startswith("http") and len(tweet_text.split()) <= 2

        if not is_just_url:
            # Regular tweet with real content — return it
            tweet_result["url"] = url
            return tweet_result

    # Tweet was empty/URL-only, or tweet lookup failed — try article endpoint
    article_result = await get_article(tweet_id)
    if article_result:
        article_result["url"] = url
        # Article payload may omit the author handle — fall back to the
        # handle parsed from the URL.
        article_result["author"] = article_result.get("author") or username
        # Article endpoint may return title but not full text
        if article_result.get("title") and not article_result.get("text"):
            article_result["text"] = (
                f'This is an X Article titled "{article_result["title"]}" by @{username}. '
                f"The API returned the title but not the full content. "
                f"Ask the user to paste the key points so you can analyze them."
            )
        return article_result

    # If we got the tweet but it was just a URL, return with helpful context
    if tweet_result:
        tweet_result["url"] = url
        tweet_result["text"] = (
            f"Tweet by @{username} links to content but contains no text. "
            f"This may be an X Article. Ask the user to paste the key points."
        )
        return tweet_result

    # Everything failed — placeholder shaped like a normal result dict.
    return {
        "text": f"[Could not fetch content from @{username}]",
        "url": url,
        "author": username,
        "author_name": "",
        "author_followers": 0,
        "engagement": 0,
        "tweet_date": "",
        "is_article": False,
    }
|
||||
|
||||
|
||||
# ─── Internal ────────────────────────────────────────────────────────────
|
||||
|
||||
def _normalize_tweet(raw: dict) -> Optional[dict]:
|
||||
"""Normalize a raw API tweet into a consistent structure."""
|
||||
text = raw.get("text", "")
|
||||
if not text:
|
||||
return None
|
||||
|
||||
author = raw.get("author", {})
|
||||
likes = raw.get("likeCount", 0) or 0
|
||||
retweets = raw.get("retweetCount", 0) or 0
|
||||
replies = raw.get("replyCount", 0) or 0
|
||||
views = raw.get("viewCount", 0) or 0
|
||||
|
||||
return {
|
||||
"id": raw.get("id", ""),
|
||||
"text": text,
|
||||
"url": raw.get("twitterUrl", raw.get("url", "")),
|
||||
"author": author.get("userName", "unknown"),
|
||||
"author_name": author.get("name", ""),
|
||||
"author_followers": author.get("followers", 0),
|
||||
"engagement": likes + retweets + replies,
|
||||
"likes": likes,
|
||||
"retweets": retweets,
|
||||
"replies": replies,
|
||||
"views": views,
|
||||
"tweet_date": raw.get("createdAt", ""),
|
||||
"is_reply": bool(raw.get("inReplyToId")),
|
||||
"is_article": False,
|
||||
}
|
||||
347
ops/pipeline-v2/telegram/x_publisher.py
Normal file
347
ops/pipeline-v2/telegram/x_publisher.py
Normal file
|
|
@ -0,0 +1,347 @@
|
|||
"""X (Twitter) publisher — posts approved tweets to X.
|
||||
|
||||
Handles the full tweet lifecycle:
|
||||
1. Agent submits draft → output gate blocks system content
|
||||
2. Draft enters approval_queue (type='tweet')
|
||||
3. Leo reviews substance → Cory approves via Telegram
|
||||
4. On approval, this module posts to X via API
|
||||
5. Records published URL and metrics
|
||||
|
||||
Uses Twitter API v2 via OAuth 1.0a for posting.
|
||||
Read operations still use twitterapi.io (x_client.py).
|
||||
|
||||
Epimetheus owns this module.
|
||||
"""
|
||||
|
||||
import json
|
||||
import hashlib
|
||||
import hmac
|
||||
import logging
|
||||
import sqlite3
|
||||
import time
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import aiohttp
|
||||
|
||||
logger = logging.getLogger("x-publisher")

# ─── Config ──────────────────────────────────────────────────────────

# Twitter API v2 credentials for posting
# OAuth 1.0a keys — stored in separate secret files, one value per file.
_SECRETS_DIR = Path("/opt/teleo-eval/secrets")
_CONSUMER_KEY_FILE = _SECRETS_DIR / "x-consumer-key"
_CONSUMER_SECRET_FILE = _SECRETS_DIR / "x-consumer-secret"
_ACCESS_TOKEN_FILE = _SECRETS_DIR / "x-access-token"
_ACCESS_SECRET_FILE = _SECRETS_DIR / "x-access-secret"

# Tweet-creation endpoint (Twitter API v2).
TWITTER_API_V2_URL = "https://api.twitter.com/2/tweets"
REQUEST_TIMEOUT = 15  # seconds per HTTP request
|
||||
|
||||
|
||||
def _load_secret(path: Path) -> Optional[str]:
|
||||
"""Load a secret from a file. Returns None if missing."""
|
||||
try:
|
||||
return path.read_text().strip()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _load_oauth_credentials() -> Optional[dict]:
    """Load all 4 OAuth 1.0a credentials. Returns None if any missing."""
    creds = {
        "consumer_key": _load_secret(_CONSUMER_KEY_FILE),
        "consumer_secret": _load_secret(_CONSUMER_SECRET_FILE),
        "access_token": _load_secret(_ACCESS_TOKEN_FILE),
        "access_secret": _load_secret(_ACCESS_SECRET_FILE),
    }
    absent = [name for name, value in creds.items() if not value]
    if not absent:
        return creds
    logger.warning("Missing X API credentials: %s", ", ".join(absent))
    return None
|
||||
|
||||
|
||||
# ─── OAuth 1.0a Signature ────────────────────────────────────────────
|
||||
|
||||
def _percent_encode(s: str) -> str:
|
||||
return urllib.parse.quote(str(s), safe="")
|
||||
|
||||
|
||||
def _generate_oauth_signature(
    method: str,
    url: str,
    params: dict,
    consumer_secret: str,
    token_secret: str,
) -> str:
    """Generate an OAuth 1.0a HMAC-SHA1 signature (RFC 5849 §3.4).

    Args:
        method: HTTP method; upper-cased when building the base string.
        url: Base request URL (no query string).
        params: All oauth_* params plus any request params to sign.
        consumer_secret: App consumer secret.
        token_secret: Access-token secret.

    Returns:
        The base64-encoded signature string.
    """
    # Hoisted to the top of the function — the original imported base64
    # mid-function, after the digest was already computed.
    import base64

    # Parameters must be percent-encoded, then sorted, then joined.
    sorted_params = "&".join(
        f"{_percent_encode(k)}={_percent_encode(v)}"
        for k, v in sorted(params.items())
    )
    base_string = f"{method.upper()}&{_percent_encode(url)}&{_percent_encode(sorted_params)}"
    # Signing key = consumer secret + token secret, each percent-encoded.
    signing_key = f"{_percent_encode(consumer_secret)}&{_percent_encode(token_secret)}"
    signature = hmac.new(
        signing_key.encode(), base_string.encode(), hashlib.sha1
    ).digest()
    return base64.b64encode(signature).decode()
|
||||
|
||||
|
||||
def _build_oauth_header(
    method: str,
    url: str,
    creds: dict,
    extra_params: dict = None,
) -> str:
    """Build the OAuth 1.0a Authorization header.

    `extra_params` (request parameters) are included in the signature
    computation but not emitted in the header itself.
    """
    import uuid

    oauth_params = {
        "oauth_consumer_key": creds["consumer_key"],
        "oauth_nonce": uuid.uuid4().hex,
        "oauth_signature_method": "HMAC-SHA1",
        "oauth_timestamp": str(int(time.time())),
        "oauth_token": creds["access_token"],
        "oauth_version": "1.0",
    }

    # The signed set = oauth params plus any extra request params.
    signed_params = dict(oauth_params)
    signed_params.update(extra_params or {})

    oauth_params["oauth_signature"] = _generate_oauth_signature(
        method,
        url,
        signed_params,
        creds["consumer_secret"],
        creds["access_secret"],
    )

    rendered = (
        f'{_percent_encode(k)}="{_percent_encode(v)}"'
        for k, v in sorted(oauth_params.items())
    )
    return "OAuth " + ", ".join(rendered)
|
||||
|
||||
|
||||
# ─── Tweet Submission ────────────────────────────────────────────────
|
||||
|
||||
def submit_tweet_draft(
    conn: sqlite3.Connection,
    content: str,
    agent: str,
    context: dict = None,
    reply_to_url: str = None,
    post_type: str = "original",
) -> tuple[int, str]:
    """Submit a tweet draft to the approval queue.

    Args:
        conn: Open SQLite connection holding the approval_queue table.
        content: Tweet text to queue for review.
        agent: Originating agent name, stored on the queue row.
        context: Optional extra keys merged into the stored context JSON
            (caller-supplied keys override the defaults).
        reply_to_url: Optional URL of the tweet this draft replies to.
        post_type: Draft kind recorded in context (default "original").

    Returns (request_id, status_message).
    status_message is None on success, error string on failure
    (in which case request_id is -1).

    The output gate and OPSEC filter run before insertion.
    """
    # Import here to avoid circular dependency
    from output_gate import gate_for_tweet_queue
    from approvals import check_opsec

    # Output gate — block system content.
    # NOTE(review): a falsy gate result means "blocked", yet
    # gate.blocked_reasons is still read from it — presumably the gate
    # result defines __bool__ while keeping attributes; confirm in
    # output_gate before refactoring.
    gate = gate_for_tweet_queue(content, agent)
    if not gate:
        return -1, f"Output gate blocked: {', '.join(gate.blocked_reasons)}"

    # OPSEC filter — check_opsec returns a violation message, or falsy if clean.
    opsec_violation = check_opsec(content)
    if opsec_violation:
        return -1, opsec_violation

    # Build context JSON stored alongside the queue row.
    ctx = {
        "post_type": post_type,
        "target_account": "TeleoHumanity",  # default, can be overridden
    }
    if reply_to_url:
        ctx["reply_to_url"] = reply_to_url
    if context:
        ctx.update(context)

    # Insert into approval queue; rows auto-expire 24 hours after submission.
    cursor = conn.execute(
        """INSERT INTO approval_queue
           (type, content, originating_agent, context, leo_review_status,
            expires_at)
           VALUES (?, ?, ?, ?, 'pending_leo',
                   datetime('now', '+24 hours'))""",
        ("tweet", content, agent, json.dumps(ctx)),
    )
    conn.commit()
    request_id = cursor.lastrowid
    logger.info("Tweet draft #%d submitted by %s (%d chars)",
                request_id, agent, len(content))
    return request_id, None
|
||||
|
||||
|
||||
# ─── Tweet Posting ───────────────────────────────────────────────────
|
||||
|
||||
async def post_tweet(text: str, reply_to_id: str = None) -> dict:
    """Post a tweet to X via Twitter API v2.

    Args:
        text: Tweet body to publish.
        reply_to_id: Optional tweet ID; when set, posts as a reply.

    Returns dict with:
    - success: bool
    - tweet_id: str (if successful)
    - tweet_url: str (if successful)
    - error: str (if failed)
    """
    creds = _load_oauth_credentials()
    if not creds:
        return {"success": False, "error": "X API credentials not configured"}

    # Build request body
    body = {"text": text}
    if reply_to_id:
        body["reply"] = {"in_reply_to_tweet_id": reply_to_id}

    # OAuth 1.0a header (for JSON body, don't include body params in signature)
    auth_header = _build_oauth_header("POST", TWITTER_API_V2_URL, creds)

    headers = {
        "Authorization": auth_header,
        "Content-Type": "application/json",
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(
                TWITTER_API_V2_URL,
                headers=headers,
                json=body,
                timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT),
            ) as resp:
                result = await resp.json()

                # Twitter API v2 returns 201 Created on success.
                if resp.status == 201:
                    tweet_id = result.get("data", {}).get("id", "")
                    # NOTE(review): the account handle in the URL is
                    # hard-coded — confirm all posts go out as @TeleoHumanity.
                    return {
                        "success": True,
                        "tweet_id": tweet_id,
                        "tweet_url": f"https://x.com/TeleoHumanity/status/{tweet_id}",
                    }
                else:
                    # v2 error payloads carry "detail"/"title" fields.
                    error = result.get("detail") or result.get("title") or str(result)
                    logger.error("Tweet post failed (%d): %s", resp.status, error)
                    return {"success": False, "error": f"API error {resp.status}: {error}"}

    except aiohttp.ClientError as e:
        logger.error("Tweet post network error: %s", e)
        return {"success": False, "error": f"Network error: {e}"}
|
||||
|
||||
|
||||
async def post_thread(tweets: list[str]) -> list[dict]:
    """Post a thread (multiple tweets in reply chain).

    Each tweet replies to the previously posted one; posting stops at the
    first failure.

    Returns list of post results, one per tweet attempted.
    """
    outcomes: list[dict] = []
    previous_id = None
    total = len(tweets)

    for position, body in enumerate(tweets, start=1):
        outcome = await post_tweet(body, reply_to_id=previous_id)
        outcomes.append(outcome)

        if not outcome["success"]:
            logger.error("Thread posting failed at tweet %d/%d: %s",
                         position, total, outcome["error"])
            break

        previous_id = outcome.get("tweet_id")

    return outcomes
|
||||
|
||||
|
||||
# ─── Post-Approval Hook ─────────────────────────────────────────────
|
||||
|
||||
async def handle_approved_tweet(
    conn: sqlite3.Connection,
    request_id: int,
) -> dict:
    """Called when a tweet is approved. Posts to X and records the result.

    Looks up the approval row, splits its content into individual tweets
    (separated by "---" lines), posts them as a reply chain, then records
    the published URL(s) or the failure into the row's context JSON.

    Args:
        conn: Open SQLite connection. Assumes conn.row_factory is
            sqlite3.Row (rows accessed by column name) — confirm at call site.
        request_id: approval_queue.id of the approved tweet draft.

    Returns:
        The post result dict for a single tweet, or
        {"success": bool, "results": [...]} for a thread.
    """
    row = conn.execute(
        "SELECT * FROM approval_queue WHERE id = ? AND type = 'tweet'",
        (request_id,),
    ).fetchone()

    if not row:
        return {"success": False, "error": f"Approval #{request_id} not found"}

    if row["status"] != "approved":
        return {"success": False, "error": f"Approval #{request_id} status is {row['status']}, not approved"}

    content = row["content"]
    ctx = json.loads(row["context"]) if row["context"] else {}

    # Parse thread (tweets separated by ---)
    tweets = [t.strip() for t in content.split("\n---\n") if t.strip()]
    if not tweets:
        # Guard: an empty draft previously crashed with an IndexError.
        return {"success": False, "error": f"Approval #{request_id} has no content to post"}

    # Extract reply_to tweet ID from URL if present
    reply_to_id = None
    reply_to_url = ctx.get("reply_to_url", "")
    if reply_to_url:
        import re
        match = re.search(r"/status/(\d+)", reply_to_url)
        if match:
            reply_to_id = match.group(1)

    # Post each tweet exactly once, chaining replies: the first tweet replies
    # to reply_to_id (if any), each subsequent tweet replies to the previous
    # one. (Bug fix: the old code posted tweets[0], then re-posted the WHOLE
    # thread via post_thread(tweets), duplicating the first tweet and losing
    # the reply_to_id chain; it also contained admitted dead code.)
    results = []
    prev_id = reply_to_id
    for text in tweets:
        result = await post_tweet(text, reply_to_id=prev_id)
        results.append(result)
        if not result["success"]:
            break
        prev_id = result.get("tweet_id")

    # Record the outcome on the approval row.
    success = all(r["success"] for r in results)
    if success:
        tweet_urls = [r.get("tweet_url", "") for r in results if r.get("tweet_url")]
        published_url = tweet_urls[0] if tweet_urls else ""

        conn.execute(
            """UPDATE approval_queue
               SET context = json_set(COALESCE(context, '{}'),
                                      '$.published_url', ?,
                                      '$.published_at', datetime('now'),
                                      '$.tweet_ids', ?)
               WHERE id = ?""",
            (published_url, json.dumps([r.get("tweet_id") for r in results]), request_id),
        )
        conn.commit()
        logger.info("Tweet #%d published: %s", request_id, published_url)
    else:
        errors = [r.get("error", "unknown") for r in results if not r["success"]]
        conn.execute(
            """UPDATE approval_queue
               SET context = json_set(COALESCE(context, '{}'),
                                      '$.post_error', ?,
                                      '$.post_attempted_at', datetime('now'))
               WHERE id = ?""",
            ("; ".join(errors), request_id),
        )
        conn.commit()
        logger.error("Tweet #%d post failed: %s", request_id, errors)

    return results[0] if len(results) == 1 else {"success": success, "results": results}
|
||||
246
ops/pipeline-v2/telegram/x_search.py
Normal file
246
ops/pipeline-v2/telegram/x_search.py
Normal file
|
|
@ -0,0 +1,246 @@
|
|||
#!/usr/bin/env python3
|
||||
"""X (Twitter) search client for user-triggered research.
|
||||
|
||||
Searches X via twitterapi.io, filters for relevance, returns structured tweet data.
|
||||
Used by the Telegram bot's /research command.
|
||||
|
||||
Epimetheus owns this module.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import aiohttp
|
||||
|
||||
logger = logging.getLogger("x-search")
|
||||
|
||||
API_URL = "https://api.twitterapi.io/twitter/tweet/advanced_search"
|
||||
API_KEY_FILE = "/opt/teleo-eval/secrets/twitterapi-io-key"
|
||||
|
||||
# Rate limiting: 3 research queries per user per day
|
||||
_research_usage: dict[int, list[float]] = {} # user_id → [timestamps]
|
||||
MAX_RESEARCH_PER_DAY = 3
|
||||
|
||||
|
||||
def _load_api_key() -> str | None:
    """Read the twitterapi.io key from disk; None (with a warning) if unreadable."""
    key_path = Path(API_KEY_FILE)
    try:
        raw = key_path.read_text()
    except Exception:
        logger.warning("Twitter API key not found at %s", API_KEY_FILE)
        return None
    return raw.strip()
|
||||
|
||||
|
||||
def check_research_rate_limit(user_id: int) -> bool:
    """Check if user has research requests remaining. Returns True if allowed."""
    cutoff = time.time() - 86400
    # Keep only usage timestamps from the last 24h, and persist the pruned list.
    recent = [stamp for stamp in _research_usage.get(user_id, []) if stamp > cutoff]
    _research_usage[user_id] = recent
    return len(recent) < MAX_RESEARCH_PER_DAY
|
||||
|
||||
|
||||
def record_research_usage(user_id: int):
    """Record a research request for rate limiting."""
    timestamps = _research_usage.get(user_id)
    if timestamps is None:
        timestamps = _research_usage[user_id] = []
    timestamps.append(time.time())
|
||||
|
||||
|
||||
def get_research_remaining(user_id: int) -> int:
    """Get remaining research requests for today."""
    cutoff = time.time() - 86400
    used = sum(1 for stamp in _research_usage.get(user_id, []) if stamp > cutoff)
    return max(0, MAX_RESEARCH_PER_DAY - used)
|
||||
|
||||
|
||||
async def search_x(query: str, max_results: int = 20, min_engagement: int = 3) -> list[dict]:
    """Search X for tweets matching query. Returns structured tweet data.

    Filters: recent tweets, min engagement threshold, skip pure retweets.
    Best-effort: any API or network problem logs a warning and yields [].
    """
    api_key = _load_api_key()
    if not api_key:
        return []

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                API_URL,
                params={"query": query, "queryType": "Latest"},
                headers={"X-API-Key": api_key},
                timeout=aiohttp.ClientTimeout(total=15),
            ) as resp:
                if resp.status >= 400:
                    logger.warning("X search API → %d for query: %s", resp.status, query)
                    return []
                payload = await resp.json()
                raw_tweets = payload.get("tweets", [])
    except Exception as exc:
        logger.warning("X search error: %s", exc)
        return []

    structured: list[dict] = []
    # Scan up to 2x the requested count so filtering still yields enough hits.
    for raw in raw_tweets[: max_results * 2]:
        body = raw.get("text", "")

        # Skip pure retweets (no original text)
        if body.startswith("RT @"):
            continue

        # Engagement filter (counts may be None in the API payload).
        like_count = raw.get("likeCount", 0) or 0
        retweet_count = raw.get("retweetCount", 0) or 0
        reply_count = raw.get("replyCount", 0) or 0
        total_engagement = like_count + retweet_count + reply_count
        if total_engagement < min_engagement:
            continue

        profile = raw.get("author", {})
        structured.append({
            "text": body,
            "url": raw.get("twitterUrl", raw.get("url", "")),
            "author": profile.get("userName", "unknown"),
            "author_name": profile.get("name", ""),
            "author_followers": profile.get("followers", 0),
            "engagement": total_engagement,
            "likes": like_count,
            "retweets": retweet_count,
            "replies": reply_count,
            "tweet_date": raw.get("createdAt", ""),
            "is_reply": bool(raw.get("inReplyToId")),
        })

        if len(structured) >= max_results:
            break

    # Sort by engagement (highest first)
    structured.sort(key=lambda item: item["engagement"], reverse=True)
    return structured
|
||||
|
||||
|
||||
def format_tweet_as_source(tweet: dict, query: str, submitted_by: str) -> str:
    """Format a tweet as a source file for inbox/queue/.

    Args:
        tweet: Structured tweet dict (keys: text, url, author, plus optional
            engagement counters) as produced by search_x()/fetch_tweet_by_url().
        query: Research query that surfaced this tweet (kept for provenance).
        submitted_by: Who proposed the source.

    Returns:
        A markdown document with YAML frontmatter, ready to write to disk.
    """
    from datetime import date

    # Fix: removed an unused `slug` local (and its `re` import) — it was
    # computed but never referenced anywhere in the function.
    author = tweet["author"]
    # Double quotes would break the quoted YAML frontmatter values; use singles.
    title_text = tweet["text"][:80].replace('"', "'")
    safe_query = query.replace('"', "'")

    return f"""---
type: source
source_type: x-post
title: "X post by @{author}: {title_text}"
url: "{tweet['url']}"
author: "@{author}"
date: {date.today().isoformat()}
domain: internet-finance
format: social-media
status: unprocessed
proposed_by: "{submitted_by}"
contribution_type: research-direction
research_query: "{safe_query}"
tweet_author: "@{author}"
tweet_author_followers: {tweet.get('author_followers', 0)}
tweet_engagement: {tweet.get('engagement', 0)}
tweet_date: "{tweet.get('tweet_date', '')}"
tags: [x-research, telegram-research]
---

## Tweet by @{author}

{tweet['text']}

---

Engagement: {tweet.get('likes', 0)} likes, {tweet.get('retweets', 0)} retweets, {tweet.get('replies', 0)} replies
Author followers: {tweet.get('author_followers', 0)}
"""
|
||||
|
||||
|
||||
async def fetch_tweet_by_url(url: str) -> dict | None:
|
||||
"""Fetch a specific tweet/article by X URL. Extracts username and tweet ID,
|
||||
searches via advanced_search (tweet/detail doesn't work with this API provider).
|
||||
"""
|
||||
import re as _re
|
||||
|
||||
# Extract username and tweet ID from URL
|
||||
match = _re.search(r'(?:twitter\.com|x\.com)/(\w+)/status/(\d+)', url)
|
||||
if not match:
|
||||
return None
|
||||
|
||||
username = match.group(1)
|
||||
tweet_id = match.group(2)
|
||||
|
||||
key = _load_api_key()
|
||||
if not key:
|
||||
return None
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# Primary: direct tweet lookup by ID (works for any tweet, any age)
|
||||
async with session.get(
|
||||
"https://api.twitterapi.io/twitter/tweets",
|
||||
params={"tweet_ids": tweet_id},
|
||||
headers={"X-API-Key": key},
|
||||
timeout=aiohttp.ClientTimeout(total=10),
|
||||
) as resp:
|
||||
if resp.status == 200:
|
||||
data = await resp.json()
|
||||
tweets = data.get("tweets", [])
|
||||
if tweets:
|
||||
tweet = tweets[0]
|
||||
author_data = tweet.get("author", {})
|
||||
return {
|
||||
"text": tweet.get("text", ""),
|
||||
"url": url,
|
||||
"author": author_data.get("userName", username),
|
||||
"author_name": author_data.get("name", ""),
|
||||
"author_followers": author_data.get("followers", 0),
|
||||
"engagement": (tweet.get("likeCount", 0) or 0) + (tweet.get("retweetCount", 0) or 0),
|
||||
"likes": tweet.get("likeCount", 0),
|
||||
"retweets": tweet.get("retweetCount", 0),
|
||||
"views": tweet.get("viewCount", 0),
|
||||
"tweet_date": tweet.get("createdAt", ""),
|
||||
"is_article": False,
|
||||
}
|
||||
|
||||
# Fallback: try article endpoint (for X long-form articles)
|
||||
async with session.get(
|
||||
"https://api.twitterapi.io/twitter/article",
|
||||
params={"tweet_id": tweet_id},
|
||||
headers={"X-API-Key": key},
|
||||
timeout=aiohttp.ClientTimeout(total=10),
|
||||
) as resp:
|
||||
if resp.status == 200:
|
||||
data = await resp.json()
|
||||
article = data.get("article")
|
||||
if article:
|
||||
return {
|
||||
"text": article.get("text", article.get("content", "")),
|
||||
"url": url,
|
||||
"author": username,
|
||||
"author_name": article.get("author", {}).get("name", ""),
|
||||
"author_followers": article.get("author", {}).get("followers", 0),
|
||||
"engagement": 0,
|
||||
"tweet_date": article.get("createdAt", ""),
|
||||
"is_article": True,
|
||||
"title": article.get("title", ""),
|
||||
}
|
||||
|
||||
# Both failed — return placeholder (Ganymede: surface failure)
|
||||
return {
|
||||
"text": f"[Could not fetch tweet content from @{username}]",
|
||||
"url": url,
|
||||
"author": username,
|
||||
"author_name": "",
|
||||
"author_followers": 0,
|
||||
"engagement": 0,
|
||||
"tweet_date": "",
|
||||
"is_article": False,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.warning("Tweet fetch error for %s: %s", url, e)
|
||||
|
||||
return None
|
||||
Loading…
Reference in a new issue