teleo-codex/ops/pipeline-v2/telegram/eval_checks.py
m3taversal 7bfce6b706 commit telegram bot module from VPS — 20 files never previously in repo
Pulled from /opt/teleo-eval/telegram/ on VPS. Includes:
- bot.py (92K), kb_retrieval.py, kb_tools.py (agentic retrieval)
- retrieval.py (RRF merge, query decomposition, entity traversal)
- response.py (system prompt builder, response parser)
- agent_config.py, agent_runner.py (multi-agent template unit support)
- approval_stages.py, approvals.py, digest.py (approval workflow)
- eval_checks.py, eval.py (response quality checks)
- output_gate.py, x_publisher.py, x_client.py, x_search.py (X pipeline)
- market_data.py, worktree_lock.py (utilities)
- rio.yaml, theseus.yaml (agent configs)

These files were deployed to VPS but never committed to the repo.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 11:02:32 +02:00

76 lines
2.8 KiB
Python

"""Eval pipeline — pure functions for response quality checks.
Extracted from bot.py so tests can import without telegram dependency.
No side effects, no I/O, no imports beyond stdlib.
Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>
"""
# Stdlib-only import — module must stay importable without the telegram deps.
import re
# Per-model pricing (input $/M tokens, output $/M tokens) — from OpenRouter
MODEL_PRICING = {
    "anthropic/claude-opus-4-6": (15.0, 75.0),
    "anthropic/claude-sonnet-4-6": (3.0, 15.0),
    "anthropic/claude-haiku-4.5": (0.80, 4.0),
    "anthropic/claude-3.5-haiku": (0.80, 4.0),
    "openai/gpt-4o": (2.50, 10.0),
    "openai/gpt-4o-mini": (0.15, 0.60),
}
# Responses whose confidence score falls below this get a warning prepended.
CONFIDENCE_FLOOR = 0.4
COST_ALERT_THRESHOLD = 0.22 # per-response alert threshold in USD
# URL fabrication regex — matches http:// and https:// URLs; stops at
# whitespace and common delimiters so trailing punctuation is excluded.
_URL_RE = re.compile(r'https?://[^\s\)\]\"\'<>]+')
class _LLMResponse(str):
"""String subclass carrying token counts and cost from OpenRouter usage field."""
prompt_tokens: int = 0
completion_tokens: int = 0
cost: float = 0.0
model: str = ""
def __new__(cls, text: str, prompt_tokens: int = 0, completion_tokens: int = 0,
cost: float = 0.0, model: str = ""):
obj = super().__new__(cls, text)
obj.prompt_tokens = prompt_tokens
obj.completion_tokens = completion_tokens
obj.cost = cost
obj.model = model
return obj
def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """Return the estimated USD cost of one request from token counts.

    Rates are looked up in MODEL_PRICING as (input $/M, output $/M);
    unknown models fall back to Sonnet-tier pricing (3.0 / 15.0).
    """
    rates = MODEL_PRICING.get(model)
    if rates is None:
        rates = (3.0, 15.0)  # unknown model — assume Sonnet pricing
    input_rate, output_rate = rates
    total = prompt_tokens * input_rate + completion_tokens * output_rate
    return total / 1_000_000
def check_url_fabrication(response_text: str, kb_context: str) -> tuple[str, list[str]]:
    """Check for fabricated URLs in response. Replace any not found in KB context.

    A URL counts as fabricated when it appears in *response_text* but not
    verbatim in *kb_context*. Each fabricated occurrence is replaced with a
    placeholder; verified URLs are left untouched.

    Returns (cleaned_text, list_of_fabricated_urls) — one list entry per
    fabricated occurrence, in order of appearance.
    """
    # Compiled locally so the check is self-contained; re caches compilation,
    # so repeated calls do not recompile.
    url_re = re.compile(r'https?://[^\s\)\]\"\'<>]+')
    kb_urls = set(url_re.findall(kb_context)) if kb_context else set()
    fabricated: list[str] = []

    def _scrub(match: re.Match) -> str:
        url = match.group(0)
        if url in kb_urls:
            return url
        fabricated.append(url)
        return "[URL removed — not verified]"

    # Substitute via a single regex pass instead of per-URL str.replace():
    # replace() on a fabricated URL that is a prefix of a verified one
    # (e.g. fabricated "https://a.com" vs verified "https://a.com/page")
    # would corrupt the verified URL as well.
    return url_re.sub(_scrub, response_text), fabricated
def apply_confidence_floor(display_response: str, confidence_score: float | None) -> tuple[str, bool, str | None]:
    """Prepend a low-confidence warning when the score falls below CONFIDENCE_FLOOR.

    A ``None`` score skips the check entirely.
    Returns (possibly_modified_response, is_blocked, block_reason).
    """
    # Guard clause: no score, or score at/above the floor — pass through.
    if confidence_score is None or confidence_score >= CONFIDENCE_FLOOR:
        return display_response, False, None
    warning = "⚠️ Low confidence — I may not have reliable data on this topic.\n\n"
    reason = f"confidence {confidence_score:.2f} < floor {CONFIDENCE_FLOOR}"
    return warning + display_response, True, reason