# Provenance: pulled from /opt/teleo-eval/telegram/ on the VPS — deployed
# there but never committed to the repo. Part of a drop that also includes
# bot.py, kb_retrieval.py / kb_tools.py (agentic retrieval), retrieval.py
# (RRF merge, query decomposition, entity traversal), response.py,
# agent_config.py / agent_runner.py, the approval workflow
# (approval_stages.py, approvals.py, digest.py), the X pipeline
# (output_gate.py, x_publisher.py, x_client.py, x_search.py),
# market_data.py, worktree_lock.py, and the rio/theseus agent configs.
"""Eval pipeline — pure functions for response quality checks.

Extracted from bot.py so tests can import without telegram dependency.
No side effects, no I/O, no imports beyond stdlib.

Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>
"""

import re

# Per-model pricing (input $/M tokens, output $/M tokens) — from OpenRouter
MODEL_PRICING = {
    "anthropic/claude-opus-4-6": (15.0, 75.0),
    "anthropic/claude-sonnet-4-6": (3.0, 15.0),
    "anthropic/claude-haiku-4.5": (0.80, 4.0),
    "anthropic/claude-3.5-haiku": (0.80, 4.0),
    "openai/gpt-4o": (2.50, 10.0),
    "openai/gpt-4o-mini": (0.15, 0.60),
}

# Responses scoring below this confidence are flagged/blocked downstream.
CONFIDENCE_FLOOR = 0.4
COST_ALERT_THRESHOLD = 0.22  # per-response alert threshold in USD

# URL fabrication regex — matches http:// and https:// URLs
_URL_RE = re.compile(r'https?://[^\s\)\]\"\'<>]+')
class _LLMResponse(str):
    """Response text that behaves exactly like ``str`` but also carries the
    token counts, dollar cost, and model id from the OpenRouter usage field.
    """

    # Class-level defaults; ``__new__`` overwrites each one per instance.
    prompt_tokens: int = 0
    completion_tokens: int = 0
    cost: float = 0.0
    model: str = ""

    def __new__(cls, text: str, prompt_tokens: int = 0, completion_tokens: int = 0,
                cost: float = 0.0, model: str = ""):
        # str is immutable, so the metadata is attached at construction time.
        inst = str.__new__(cls, text)
        inst.prompt_tokens = prompt_tokens
        inst.completion_tokens = completion_tokens
        inst.cost = cost
        inst.model = model
        return inst
def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """Estimate cost in USD from token counts and model pricing.

    Unknown models fall back to Sonnet-class rates (3.0 in / 15.0 out $/M).
    """
    in_rate, out_rate = MODEL_PRICING.get(model, (3.0, 15.0))
    # Rates are expressed per million tokens, hence the final division.
    dollars_per_million = prompt_tokens * in_rate + completion_tokens * out_rate
    return dollars_per_million / 1_000_000
def check_url_fabrication(response_text: str, kb_context: str) -> tuple[str, list[str]]:
    """Check for fabricated URLs in response. Replace any not found in KB context.

    Any http(s) URL in the response that does not appear verbatim in the KB
    context is treated as fabricated and replaced with a placeholder.

    Returns (cleaned_text, list_of_fabricated_urls).
    """
    # Same pattern as the module-level _URL_RE constant.
    url_re = re.compile(r'https?://[^\s\)\]\"\'<>]+')
    known = set(url_re.findall(kb_context)) if kb_context else set()

    cleaned = response_text
    fabricated: list[str] = []
    for url in url_re.findall(response_text):
        if url in known:
            continue
        fabricated.append(url)
        cleaned = cleaned.replace(url, "[URL removed — not verified]")
    return cleaned, fabricated
def apply_confidence_floor(display_response: str, confidence_score: float | None) -> tuple[str, bool, str | None]:
    """Apply confidence floor check.

    A ``None`` score means no score was reported; the response passes through.

    Returns (possibly_modified_response, is_blocked, block_reason).
    """
    passes = confidence_score is None or confidence_score >= CONFIDENCE_FLOOR
    if passes:
        return display_response, False, None

    warned = (
        "⚠️ Low confidence — I may not have reliable data on this topic.\n\n"
        + display_response
    )
    reason = f"confidence {confidence_score:.2f} < floor {CONFIDENCE_FLOOR}"
    return warned, True, reason