# teleo-infrastructure/telegram/eval.py

"""Eval pipeline stub — provides imports for bot.py.
Full implementation pending Ganymede review."""

# Threshold used by apply_confidence_floor: a response whose confidence is
# strictly below this value gets a caveat prepended and is reported as blocked.
CONFIDENCE_FLOOR = 0.3
# Cost alerting threshold. Not referenced in the visible part of this module;
# presumably compared against a cost in the same unit estimate_cost returns
# -- TODO confirm against the caller (bot.py).
COST_ALERT_THRESHOLD = 0.22


class _LLMResponse(str):
    """str subclass carrying token counts and cost."""
    def __new__(cls, content, prompt_tokens=0, completion_tokens=0, cost=0.0, model=''):
        obj = super().__new__(cls, content)
        obj.prompt_tokens = prompt_tokens
        obj.completion_tokens = completion_tokens
        obj.cost = cost
        obj.model = model
        return obj


def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """Estimate the cost of one call from its token counts.

    Each known model id maps to an ``(input_rate, output_rate)`` pair. Rates
    are per one million tokens (the weighted sum is divided by 1_000_000);
    the currency is not stated here -- presumably USD.

    Args:
        model: Model identifier. A table entry applies when its key occurs
            anywhere inside this string; the first matching entry wins.
        prompt_tokens: Number of input tokens.
        completion_tokens: Number of output tokens.

    Returns:
        The estimated cost. Models matching no entry are priced at
        (3.0, 15.0).
    """
    fallback_rates = (3.0, 15.0)
    pricing = {
        'anthropic/claude-opus-4': (15.0, 75.0),
        'anthropic/claude-sonnet-4': (3.0, 15.0),
        'anthropic/claude-haiku-4.5': (0.80, 4.0),
        'openai/gpt-4o': (2.50, 10.0),
    }
    # Dicts iterate in insertion order, so next() picks the first entry
    # whose key is a substring of the model id.
    input_rate, output_rate = next(
        (pair for model_key, pair in pricing.items() if model_key in model),
        fallback_rates,
    )
    return (prompt_tokens * input_rate + completion_tokens * output_rate) / 1_000_000


def check_url_fabrication(response: str, kb_context: str) -> tuple[str, list[str]]:
    """Remove URLs from ``response`` that do not appear in ``kb_context``.

    A URL in the response is kept when it also occurs in ``kb_context`` or
    when it starts with ``https://t.me/``. Every other URL occurrence is
    replaced with ``[URL removed]``.

    Args:
        response: Text to scan.
        kb_context: Text whose URLs are treated as the allowed set. When it
            is empty no check is performed and the response is returned
            unchanged.

    Returns:
        ``(cleaned_response, fabricated_urls)``. ``fabricated_urls`` has one
        entry per removed occurrence, in order of appearance, without any
        trailing punctuation.
    """
    import re

    if not response or not kb_context:
        return response, []

    url_pattern = re.compile(r'https?://[^\s\)"]+')
    # Punctuation that usually belongs to the surrounding sentence or markup
    # rather than to the URL ("see https://x.y/z." or "<https://x.y>").
    trailing_punctuation = '.,;:!?\'>]}'

    # Trim both sides the same way so "https://x.y/z." in the response still
    # matches "https://x.y/z" in the knowledge base (and vice versa).
    kb_urls = {
        found.rstrip(trailing_punctuation)
        for found in url_pattern.findall(kb_context)
    }
    fabricated: list[str] = []

    def _scrub(match):
        raw = match.group(0)
        url = raw.rstrip(trailing_punctuation)
        if url in kb_urls or url.startswith('https://t.me/'):
            return raw
        fabricated.append(url)
        # Keep the trimmed punctuation so the sentence stays intact.
        return '[URL removed]' + raw[len(url):]

    # Substituting per match (instead of str.replace over the whole text)
    # keeps a fabricated URL that is a prefix of an allowed one from
    # corrupting the allowed URL.
    cleaned = url_pattern.sub(_scrub, response)
    return cleaned, fabricated


def apply_confidence_floor(response: str, confidence: float | None) -> tuple[str, bool, str | None]:
    """Flag a response whose confidence is below ``CONFIDENCE_FLOOR``.

    Args:
        response: The response text.
        confidence: Confidence score, or ``None`` when no score is
            available (in which case the response passes through).

    Returns:
        ``(response, blocked, block_reason)``. When the confidence is
        strictly below the floor, the response is returned with a warning
        prefix, ``blocked`` is ``True`` and ``block_reason`` describes the
        shortfall. Otherwise the response is returned unchanged with
        ``(False, None)``.
    """
    below_floor = confidence is not None and confidence < CONFIDENCE_FLOOR
    if not below_floor:
        return response, False, None

    flagged = '⚠️ Low confidence response — treat with skepticism.\n\n' + response
    reason = f'confidence {confidence:.2f} below floor {CONFIDENCE_FLOOR}'
    return flagged, True, reason