teleo-infrastructure/tests/test_eval_pipeline.py
m3taversal 670c50f384
Some checks are pending
CI / lint-and-test (push) Waiting to run
fix: add telegram/ and tests/ to deploy pipeline, remove hardcoded API key
deploy.sh was missing telegram/ and tests/ directories — code existed in
repo but never synced to VPS. Also removes hardcoded twitterapi.io key
from x-ingest.py (reads from secrets file like all other modules).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-20 17:15:55 +01:00

375 lines
15 KiB
Python

"""Tests for eval pipeline — cost tracking, URL fabrication check, confidence floor.
Tests validate against real failure modes from audit records:
- Record #12: hallucinated futard.io URL
- Records #3, #9: confident fabrication at 0.7
- Records #6, #7: low confidence (0.1) with no gate
"""
import re
import sqlite3
import sys
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
# Make two directories importable: the telegram/ folder that sits next to this
# file's folder, and the directory containing this file's folder (presumably
# the repository root). Both are inserted at index 0, so the one inserted last
# ends up first in sys.path.
sys.path.insert(0, str(Path(__file__).parent.parent / "telegram"))
sys.path.insert(0, str(Path(__file__).parent.parent))
# ─── _estimate_cost tests ─────────────────────────────────────────────────
# bot.py cannot be imported here: it depends on the telegram package, which is
# not available in the test environment. Rather than mocking those imports,
# the pure functions are re-implemented below (they're simple math).
# Per-1M-token pricing (must match bot.py)
# Maps a model id to a (prompt rate, completion rate) pair, in dollars per one
# million tokens. _estimate_cost multiplies prompt tokens by element 0 and
# completion tokens by element 1, then divides by 1_000_000.
_MODEL_COSTS = {
    "anthropic/claude-opus-4-6": (15.0, 75.0),
    "anthropic/claude-sonnet-4-6": (3.0, 15.0),
    "anthropic/claude-haiku-4.5": (0.80, 4.0),
    "openai/gpt-4o": (2.50, 10.0),
    "openai/gpt-4o-mini": (0.15, 0.60),
}
def _estimate_cost(model, prompt_tokens, completion_tokens):
    """Estimate the dollar cost of one LLM call (local mirror of bot.py).

    Rates come from ``_MODEL_COSTS``; a model that is not listed there is
    priced at the fallback pair (5.0, 15.0). The result is the rate-weighted
    token total divided by one million.
    """
    prompt_rate, completion_rate = _MODEL_COSTS.get(model, (5.0, 15.0))
    weighted_tokens = prompt_tokens * prompt_rate + completion_tokens * completion_rate
    return weighted_tokens / 1_000_000
def _check_url_fabrication(response_text, kb_context):
"""Mirror of bot.py's _check_url_fabrication for testing."""
response_urls = set(re.findall(r'https?://[^\s\)>\]]+', response_text))
if not response_urls:
return []
context_urls = set(re.findall(r'https?://[^\s\)>\]]+', kb_context))
return sorted(response_urls - context_urls)
class TestEstimateCost:
    """Cost estimation tests for the local ``_estimate_cost`` mirror."""

    # Absolute tolerance for float comparisons. The expected values here are
    # as small as 0.0016, so the tolerance has to be far below that for a
    # wrong per-token rate to actually fail the test.
    _TOL = 1e-9

    def test_opus_typical_response(self):
        """Typical Opus response: ~2000 prompt tokens, ~500 completion."""
        cost = _estimate_cost("anthropic/claude-opus-4-6", 2000, 500)
        # 2000 * 15/1M + 500 * 75/1M = 0.03 + 0.0375 = 0.0675
        assert abs(cost - 0.0675) < self._TOL

    def test_haiku_cheap(self):
        """Haiku calls should be very cheap."""
        cost = _estimate_cost("anthropic/claude-haiku-4.5", 1000, 200)
        # 1000 * 0.8/1M + 200 * 4/1M = 0.0008 + 0.0008 = 0.0016
        assert abs(cost - 0.0016) < self._TOL

    def test_unknown_model_uses_conservative_default(self):
        """Unknown model falls back to $5/$15 per M tokens."""
        cost = _estimate_cost("some-unknown/model", 1000, 1000)
        # 1000 * 5/1M + 1000 * 15/1M = 0.005 + 0.015 = 0.02
        assert abs(cost - 0.02) < self._TOL

    def test_zero_tokens_zero_cost(self):
        """No tokens means no cost, regardless of the model's rates."""
        cost = _estimate_cost("anthropic/claude-opus-4-6", 0, 0)
        assert cost == 0.0

    def test_gpt4o_mini_cheapest(self):
        """GPT-4o-mini should be the cheapest model in the pricing table."""
        mini = _estimate_cost("openai/gpt-4o-mini", 10000, 1000)
        assert mini < 0.003  # very cheap in absolute terms
        # Compare against every other priced model at the same token counts,
        # so the "cheapest" claim is actually checked.
        for model in _MODEL_COSTS:
            if model == "openai/gpt-4o-mini":
                continue
            other = _estimate_cost(model, 10000, 1000)
            assert mini < other, f"{model} is not more expensive than gpt-4o-mini"

    def test_opus_more_expensive_than_haiku(self):
        """Same token counts: Opus should cost well over 10x what Haiku does."""
        opus = _estimate_cost("anthropic/claude-opus-4-6", 1000, 500)
        haiku = _estimate_cost("anthropic/claude-haiku-4.5", 1000, 500)
        # With the current table the ratio is 0.0525 / 0.0028 = 18.75.
        assert opus > haiku * 10
class TestURLFabrication:
    """URL fabrication detection — catches failure mode #2 (record #12)."""

    def test_no_urls_in_response(self):
        """A response that cites no URL yields an empty result."""
        flagged = _check_url_fabrication("MetaDAO uses futarchy for governance.", "some kb context")
        assert flagged == []

    def test_url_present_in_context(self):
        """A URL that also appears in the KB context is not reported."""
        flagged = _check_url_fabrication(
            "Check out https://metadao.fi/proposals for details.",
            "Source: https://metadao.fi/proposals — MetaDAO governance",
        )
        assert flagged == []

    def test_fabricated_url_caught(self):
        """Record #12: the fabricated futard.io URL is reported."""
        flagged = _check_url_fabrication(
            "You can find the proposal at https://futard.io/proposal/GPT8d...",
            "MetaDAO uses conditional tokens for governance decisions.",
        )
        assert len(flagged) == 1
        assert "futard.io" in flagged[0]

    def test_multiple_fabricated_urls(self):
        """Every fabricated URL is reported; the one backed by context is not."""
        answer = (
            "See https://fake1.com/page and also https://fake2.org/data "
            "and the real one https://metadao.fi"
        )
        flagged = _check_url_fabrication(answer, "Source: https://metadao.fi — real URL")
        assert len(flagged) == 2
        assert any("fake1.com" in url for url in flagged)
        assert any("fake2.org" in url for url in flagged)

    def test_url_in_parentheses(self):
        """A URL inside markdown link syntax is still extracted."""
        flagged = _check_url_fabrication(
            "Check [here](https://fabricated.io/page) for more.",
            "No URLs in context.",
        )
        assert len(flagged) == 1
        assert "fabricated.io" in flagged[0]

    def test_empty_context_flags_all_urls(self):
        """With an empty KB context, any URL in the response is reported."""
        flagged = _check_url_fabrication("See https://example.com for more.", "")
        assert len(flagged) == 1

    def test_url_replacement_in_response(self):
        """Replacing each reported URL removes it from the displayed text."""
        placeholder = "[URL removed — not verified]"
        shown = "Visit https://futard.io/proposal/GPT8d for details."
        for bad_url in _check_url_fabrication(shown, "no urls here"):
            shown = shown.replace(bad_url, placeholder)
        assert "futard.io" not in shown
        assert placeholder in shown
class TestConfidenceFloor:
    """Confidence floor tests — catches failure modes #4, #6, #7.

    The gate is modelled once by the private helpers below and every test goes
    through them, so the threshold, the ``None`` handling and the caveat text
    are defined in a single place instead of being re-typed per test.
    NOTE(review): the helpers are a local model of the gate — keep them in
    sync with the real implementation in bot.py.
    """

    # Scores strictly below this value count as low confidence.
    _FLOOR = 0.3
    # Text placed in front of a low-confidence answer.
    _CAVEAT = "⚠️ Low confidence — the knowledge base may not have good coverage here.\n\n"

    @classmethod
    def _is_flagged(cls, confidence):
        """Return 1 if *confidence* is a number below the floor, otherwise 0.

        ``None`` (no score available) is never flagged, and is checked first
        so it is never compared against a float.
        """
        return 1 if confidence is not None and confidence < cls._FLOOR else 0

    @classmethod
    def _with_caveat(cls, display, confidence):
        """Return *display* with the caveat prefixed when the score is flagged."""
        if cls._is_flagged(confidence):
            return f"{cls._CAVEAT}{display}"
        return display

    def test_low_confidence_gets_caveat(self):
        """Confidence < 0.3 should trigger caveat prefix."""
        display = self._with_caveat("The first project was Saber Vote Market.", 0.1)
        assert display.startswith("⚠️ Low confidence")
        assert "Saber Vote Market" in display

    def test_high_confidence_no_caveat(self):
        """Confidence >= 0.3 should pass through unchanged."""
        original = "MetaDAO uses conditional tokens."
        assert self._with_caveat(original, 0.7) == original

    def test_none_confidence_no_caveat(self):
        """None confidence (parsing failure) should not trigger caveat."""
        original = "Some response."
        assert self._with_caveat(original, None) == original

    def test_boundary_value_0_3(self):
        """Confidence exactly 0.3 should NOT trigger (< not <=)."""
        assert self._is_flagged(0.3) == 0

    def test_boundary_value_0_29(self):
        """Confidence 0.29 should trigger."""
        assert self._is_flagged(0.29) == 1
class TestSchemaV10:
    """Test that migration v10 adds correct columns.

    The ALTER TABLE statements are issued by a local helper rather than by the
    real migration code, against throwaway in-memory SQLite databases.
    """

    @staticmethod
    def _add_columns(conn, columns):
        """Add each ``(name, type)`` column to ``response_audit``.

        Only SQLite's "duplicate column name" error is tolerated, which is what
        makes a second run a no-op. Any other failure (missing table, syntax
        error, ...) is re-raised instead of being silently swallowed.
        """
        for col_name, col_type in columns:
            try:
                conn.execute(f"ALTER TABLE response_audit ADD COLUMN {col_name} {col_type}")
            except sqlite3.OperationalError as exc:
                if "duplicate column name" not in str(exc):
                    raise

    @staticmethod
    def _column_names(conn):
        """Return the column names of ``response_audit`` in table order."""
        # PRAGMA table_info rows carry the column name at index 1.
        return [row[1] for row in conn.execute("PRAGMA table_info(response_audit)").fetchall()]

    def test_migration_adds_columns(self):
        """Verify migration v10 adds all 8 new columns to response_audit."""
        conn = sqlite3.connect(":memory:")
        try:
            conn.execute("CREATE TABLE schema_version (version INTEGER PRIMARY KEY, applied_at TEXT)")
            conn.execute("INSERT INTO schema_version (version) VALUES (9)")
            # Create response_audit with v9 schema (no cost/blocked columns)
            conn.execute("""
                CREATE TABLE response_audit (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT,
                    chat_id INTEGER,
                    user TEXT,
                    agent TEXT DEFAULT 'rio',
                    model TEXT,
                    query TEXT,
                    conversation_window TEXT,
                    entities_matched TEXT,
                    claims_matched TEXT,
                    retrieval_layers_hit TEXT,
                    retrieval_gap TEXT,
                    market_data TEXT,
                    research_context TEXT,
                    kb_context_text TEXT,
                    tool_calls TEXT,
                    raw_response TEXT,
                    display_response TEXT,
                    confidence_score REAL,
                    response_time_ms INTEGER,
                    created_at TEXT
                )
            """)
            # Columns introduced by v10
            new_cols = [
                ("prompt_tokens", "INTEGER"),
                ("completion_tokens", "INTEGER"),
                ("generation_cost", "REAL"),
                ("embedding_cost", "REAL"),
                ("total_cost", "REAL"),
                ("blocked", "INTEGER DEFAULT 0"),
                ("block_reason", "TEXT"),
                ("query_type", "TEXT"),
            ]
            self._add_columns(conn, new_cols)
            # Verify all columns exist
            cols = self._column_names(conn)
            for col_name, _ in new_cols:
                assert col_name in cols, f"Missing column: {col_name}"
        finally:
            conn.close()

    def test_insert_with_new_columns(self):
        """Verify insert works with new columns."""
        conn = sqlite3.connect(":memory:")
        try:
            conn.execute("""
                CREATE TABLE response_audit (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    query TEXT,
                    prompt_tokens INTEGER,
                    completion_tokens INTEGER,
                    generation_cost REAL,
                    total_cost REAL,
                    blocked INTEGER DEFAULT 0,
                    block_reason TEXT
                )
            """)
            conn.execute(
                "INSERT INTO response_audit (query, prompt_tokens, completion_tokens, generation_cost, total_cost, blocked, block_reason) VALUES (?, ?, ?, ?, ?, ?, ?)",
                ("test query", 2000, 500, 0.0675, 0.0675, 1, "confidence_floor: 0.1"),
            )
            # Select by name so the assertions do not depend on column order.
            query, prompt_tokens, blocked = conn.execute(
                "SELECT query, prompt_tokens, blocked FROM response_audit"
            ).fetchone()
            assert query == "test query"
            assert prompt_tokens == 2000
            assert blocked == 1
        finally:
            conn.close()

    def test_migration_idempotent(self):
        """Running migration twice should not error (column already exists)."""
        conn = sqlite3.connect(":memory:")
        try:
            conn.execute("CREATE TABLE response_audit (id INTEGER PRIMARY KEY, query TEXT)")
            # Run twice; the second pass hits "duplicate column name" for both.
            for _ in range(2):
                self._add_columns(conn, [("blocked", "INTEGER DEFAULT 0"), ("total_cost", "REAL")])
            cols = self._column_names(conn)
            assert "blocked" in cols
            assert "total_cost" in cols
        finally:
            conn.close()
class TestLLMResponse:
    """Test the _LLMResponse string subclass."""

    # Can't import bot.py (telegram dependency), so the pattern is reproduced
    # here once and shared by the tests below.
    class _LLMResponse(str):
        prompt_tokens: int = 0
        completion_tokens: int = 0
        cost: float = 0.0
        model: str = ""

    def test_behaves_as_string(self):
        """_LLMResponse should work as a regular string."""
        reply = self._LLMResponse("Hello world")
        assert str(reply) == "Hello world"
        assert "Hello" in reply
        assert len(reply) == 11

    def test_carries_metadata(self):
        """Metadata set on an instance can be read back, directly or via getattr."""
        reply = self._LLMResponse("response text")
        reply.prompt_tokens = 2000
        reply.completion_tokens = 500
        reply.cost = 0.0675
        reply.model = "anthropic/claude-opus-4-6"
        assert reply.prompt_tokens == 2000
        assert reply.cost == 0.0675
        # getattr works (this is how bot.py accesses it)
        assert getattr(reply, 'prompt_tokens', 0) == 2000
        assert getattr(reply, 'cost', 0.0) == 0.0675

    def test_getattr_on_none_returns_default(self):
        """When response is None, getattr should return defaults."""
        missing = None
        assert getattr(missing, 'prompt_tokens', 0) == 0
        assert getattr(missing, 'cost', 0.0) == 0.0
class TestRealFailureModes:
    """Replay real failure modes from audit records to verify checks would catch them."""

    def test_record_12_fabricated_url(self):
        """Record #12: futard.io/proposal/GPT8d... — a completely fabricated URL."""
        answer = (
            "You can find the proposal at https://futard.io/proposal/GPT8d... "
            "which shows the conditional token mechanics."
        )
        knowledge = (
            "MetaDAO governance uses conditional tokens. When a proposal passes, "
            "tokens on the winning side become redeemable."
        )
        flagged = _check_url_fabrication(answer, knowledge)
        assert len(flagged) > 0, "Should catch fabricated futard.io URL"
        # Verify replacement works
        shown = answer
        for bad_url in flagged:
            shown = shown.replace(bad_url, "[URL removed — not verified]")
        assert "futard.io" not in shown

    def test_record_3_confident_fabrication(self):
        """Record #3: bot listed 4 wrong ownership coins at 0.7 confidence.

        The confidence floor (0.3) doesn't catch this — 0.7 > 0.3.
        This test documents the gap. Layer 3 (Haiku grounding) is needed.
        """
        score = 0.7
        blocked = int(score < 0.3)
        # This is a known gap — documented, not a test failure
        assert blocked == 0, "Confidence floor correctly does NOT catch high-confidence fabrication"

    def test_record_6_low_confidence_speculating(self):
        """Record #6: confidence 0.1, bot still speculated.

        Confidence floor should flag this in observation mode.
        """
        score = 0.1
        blocked = int(score is not None and score < 0.3)
        assert blocked == 1, "Should flag confidence 0.1 in observation mode"