"""Tests for eval pipeline — cost tracking, URL fabrication check, confidence floor. Tests validate against real failure modes from audit records: - Record #12: hallucinated futard.io URL - Records #3, #9: confident fabrication at 0.7 - Records #6, #7: low confidence (0.1) with no gate """ import re import sqlite3 import sys from pathlib import Path from unittest.mock import patch, MagicMock import pytest # Add telegram/ and lib/ to path for imports sys.path.insert(0, str(Path(__file__).parent.parent / "telegram")) sys.path.insert(0, str(Path(__file__).parent.parent)) # ─── _estimate_cost tests ───────────────────────────────────────────────── # Import the functions directly from bot.py module # We need to mock the telegram imports since they won't be available in test # Instead, test the pure functions by reimplementing them (they're simple math) # Per-1M-token pricing (must match bot.py) _MODEL_COSTS = { "anthropic/claude-opus-4-6": (15.0, 75.0), "anthropic/claude-sonnet-4-6": (3.0, 15.0), "anthropic/claude-haiku-4.5": (0.80, 4.0), "openai/gpt-4o": (2.50, 10.0), "openai/gpt-4o-mini": (0.15, 0.60), } def _estimate_cost(model, prompt_tokens, completion_tokens): """Mirror of bot.py's _estimate_cost for testing.""" rates = _MODEL_COSTS.get(model, (5.0, 15.0)) return (prompt_tokens * rates[0] + completion_tokens * rates[1]) / 1_000_000 def _check_url_fabrication(response_text, kb_context): """Mirror of bot.py's _check_url_fabrication for testing.""" response_urls = set(re.findall(r'https?://[^\s\)>\]]+', response_text)) if not response_urls: return [] context_urls = set(re.findall(r'https?://[^\s\)>\]]+', kb_context)) return sorted(response_urls - context_urls) class TestEstimateCost: """Cost estimation tests.""" def test_opus_typical_response(self): """Typical Opus response: ~2000 prompt tokens, ~500 completion.""" cost = _estimate_cost("anthropic/claude-opus-4-6", 2000, 500) # 2000 * 15/1M + 500 * 75/1M = 0.03 + 0.0375 = 0.0675 assert abs(cost - 0.0675) < 0.0001 def test_haiku_cheap(self): """Haiku calls should be very cheap.""" cost = _estimate_cost("anthropic/claude-haiku-4.5", 1000, 200) # 1000 * 0.8/1M + 200 * 4/1M = 0.0008 + 0.0008 = 0.0016 assert abs(cost - 0.0016) < 0.0001 def test_unknown_model_uses_conservative_default(self): """Unknown model falls back to $5/$15 per M tokens.""" cost = _estimate_cost("some-unknown/model", 1000, 1000) # 1000 * 5/1M + 1000 * 15/1M = 0.005 + 0.015 = 0.02 assert abs(cost - 0.02) < 0.0001 def test_zero_tokens_zero_cost(self): cost = _estimate_cost("anthropic/claude-opus-4-6", 0, 0) assert cost == 0.0 def test_gpt4o_mini_cheapest(self): """GPT-4o-mini should be cheapest mainstream model.""" cost = _estimate_cost("openai/gpt-4o-mini", 10000, 1000) assert cost < 0.003 # very cheap def test_opus_more_expensive_than_haiku(self): """Same token counts, Opus should be ~20x more expensive than Haiku.""" opus = _estimate_cost("anthropic/claude-opus-4-6", 1000, 500) haiku = _estimate_cost("anthropic/claude-haiku-4.5", 1000, 500) assert opus > haiku * 10 class TestURLFabrication: """URL fabrication detection — catches failure mode #2 (record #12).""" def test_no_urls_in_response(self): """Response without URLs passes.""" result = _check_url_fabrication("MetaDAO uses futarchy for governance.", "some kb context") assert result == [] def test_url_present_in_context(self): """URL that exists in KB context is NOT flagged.""" response = "Check out https://metadao.fi/proposals for details." context = "Source: https://metadao.fi/proposals — MetaDAO governance" result = _check_url_fabrication(response, context) assert result == [] def test_fabricated_url_caught(self): """Record #12: bot fabricated futard.io URL — should be caught.""" response = "You can find the proposal at https://futard.io/proposal/GPT8d..." context = "MetaDAO uses conditional tokens for governance decisions." result = _check_url_fabrication(response, context) assert len(result) == 1 assert "futard.io" in result[0] def test_multiple_fabricated_urls(self): """Multiple fabricated URLs all get caught.""" response = ( "See https://fake1.com/page and also https://fake2.org/data " "and the real one https://metadao.fi" ) context = "Source: https://metadao.fi — real URL" result = _check_url_fabrication(response, context) assert len(result) == 2 assert "fake1.com" in result[0] or "fake1.com" in result[1] assert "fake2.org" in result[0] or "fake2.org" in result[1] def test_url_in_parentheses(self): """URL inside markdown link syntax should be extracted.""" response = "Check [here](https://fabricated.io/page) for more." context = "No URLs in context." result = _check_url_fabrication(response, context) assert len(result) == 1 assert "fabricated.io" in result[0] def test_empty_context_flags_all_urls(self): """If KB context has no URLs, any response URL is fabricated.""" response = "See https://example.com for more." result = _check_url_fabrication(response, "") assert len(result) == 1 def test_url_replacement_in_response(self): """Verify that URL replacement produces correct output.""" display = "Visit https://futard.io/proposal/GPT8d for details." fabricated = _check_url_fabrication(display, "no urls here") for url in fabricated: display = display.replace(url, "[URL removed — not verified]") assert "futard.io" not in display assert "[URL removed — not verified]" in display class TestConfidenceFloor: """Confidence floor tests — catches failure modes #4, #6, #7.""" def test_low_confidence_gets_caveat(self): """Confidence < 0.3 should trigger caveat prefix.""" confidence = 0.1 display = "The first project was Saber Vote Market." if confidence < 0.3: display = f"⚠️ Low confidence — the knowledge base may not have good coverage here.\n\n{display}" assert display.startswith("⚠️ Low confidence") assert "Saber Vote Market" in display def test_high_confidence_no_caveat(self): """Confidence >= 0.3 should pass through unchanged.""" confidence = 0.7 display = "MetaDAO uses conditional tokens." original = display if confidence < 0.3: display = f"⚠️ Low confidence\n\n{display}" assert display == original def test_none_confidence_no_caveat(self): """None confidence (parsing failure) should not trigger caveat.""" confidence = None display = "Some response." original = display if confidence is not None and confidence < 0.3: display = f"⚠️ Low confidence\n\n{display}" assert display == original def test_boundary_value_0_3(self): """Confidence exactly 0.3 should NOT trigger (< not <=).""" confidence = 0.3 blocked = 1 if confidence < 0.3 else 0 assert blocked == 0 def test_boundary_value_0_29(self): """Confidence 0.29 should trigger.""" confidence = 0.29 blocked = 1 if confidence < 0.3 else 0 assert blocked == 1 class TestSchemaV10: """Test that migration v10 adds correct columns.""" def test_migration_adds_columns(self): """Verify migration v10 adds all 8 new columns to response_audit.""" conn = sqlite3.connect(":memory:") conn.execute("CREATE TABLE schema_version (version INTEGER PRIMARY KEY, applied_at TEXT)") conn.execute("INSERT INTO schema_version (version) VALUES (9)") # Create response_audit with v9 schema (no cost/blocked columns) conn.execute(""" CREATE TABLE response_audit ( id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TEXT, chat_id INTEGER, user TEXT, agent TEXT DEFAULT 'rio', model TEXT, query TEXT, conversation_window TEXT, entities_matched TEXT, claims_matched TEXT, retrieval_layers_hit TEXT, retrieval_gap TEXT, market_data TEXT, research_context TEXT, kb_context_text TEXT, tool_calls TEXT, raw_response TEXT, display_response TEXT, confidence_score REAL, response_time_ms INTEGER, created_at TEXT ) """) # Run migration v10 new_cols = [ ("prompt_tokens", "INTEGER"), ("completion_tokens", "INTEGER"), ("generation_cost", "REAL"), ("embedding_cost", "REAL"), ("total_cost", "REAL"), ("blocked", "INTEGER DEFAULT 0"), ("block_reason", "TEXT"), ("query_type", "TEXT"), ] for col_name, col_type in new_cols: try: conn.execute(f"ALTER TABLE response_audit ADD COLUMN {col_name} {col_type}") except Exception: pass # Verify all columns exist cols = [row[1] for row in conn.execute("PRAGMA table_info(response_audit)").fetchall()] for col_name, _ in new_cols: assert col_name in cols, f"Missing column: {col_name}" def test_insert_with_new_columns(self): """Verify insert works with new columns.""" conn = sqlite3.connect(":memory:") conn.execute(""" CREATE TABLE response_audit ( id INTEGER PRIMARY KEY AUTOINCREMENT, query TEXT, prompt_tokens INTEGER, completion_tokens INTEGER, generation_cost REAL, total_cost REAL, blocked INTEGER DEFAULT 0, block_reason TEXT ) """) conn.execute( "INSERT INTO response_audit (query, prompt_tokens, completion_tokens, generation_cost, total_cost, blocked, block_reason) VALUES (?, ?, ?, ?, ?, ?, ?)", ("test query", 2000, 500, 0.0675, 0.0675, 1, "confidence_floor: 0.1"), ) row = conn.execute("SELECT * FROM response_audit").fetchone() assert row[1] == "test query" assert row[2] == 2000 assert row[6] == 1 def test_migration_idempotent(self): """Running migration twice should not error (column already exists).""" conn = sqlite3.connect(":memory:") conn.execute("CREATE TABLE response_audit (id INTEGER PRIMARY KEY, query TEXT)") # Run twice for _ in range(2): for col_name, col_type in [("blocked", "INTEGER DEFAULT 0"), ("total_cost", "REAL")]: try: conn.execute(f"ALTER TABLE response_audit ADD COLUMN {col_name} {col_type}") except sqlite3.OperationalError: pass # Expected on second run cols = [row[1] for row in conn.execute("PRAGMA table_info(response_audit)").fetchall()] assert "blocked" in cols assert "total_cost" in cols class TestLLMResponse: """Test the _LLMResponse string subclass.""" def test_behaves_as_string(self): """_LLMResponse should work as a regular string.""" # Can't import bot.py (telegram dependency), so test the pattern class _LLMResponse(str): prompt_tokens: int = 0 completion_tokens: int = 0 cost: float = 0.0 model: str = "" r = _LLMResponse("Hello world") assert str(r) == "Hello world" assert "Hello" in r assert len(r) == 11 def test_carries_metadata(self): class _LLMResponse(str): prompt_tokens: int = 0 completion_tokens: int = 0 cost: float = 0.0 model: str = "" r = _LLMResponse("response text") r.prompt_tokens = 2000 r.completion_tokens = 500 r.cost = 0.0675 r.model = "anthropic/claude-opus-4-6" assert r.prompt_tokens == 2000 assert r.cost == 0.0675 # getattr works (this is how bot.py accesses it) assert getattr(r, 'prompt_tokens', 0) == 2000 assert getattr(r, 'cost', 0.0) == 0.0675 def test_getattr_on_none_returns_default(self): """When response is None, getattr should return defaults.""" response = None assert getattr(response, 'prompt_tokens', 0) == 0 assert getattr(response, 'cost', 0.0) == 0.0 class TestRealFailureModes: """Replay real failure modes from audit records to verify checks would catch them.""" def test_record_12_fabricated_url(self): """Record #12: futard.io/proposal/GPT8d... — a completely fabricated URL.""" response = ( "You can find the proposal at https://futard.io/proposal/GPT8d... " "which shows the conditional token mechanics." ) kb_context = ( "MetaDAO governance uses conditional tokens. When a proposal passes, " "tokens on the winning side become redeemable." ) fabricated = _check_url_fabrication(response, kb_context) assert len(fabricated) > 0, "Should catch fabricated futard.io URL" # Verify replacement works display = response for url in fabricated: display = display.replace(url, "[URL removed — not verified]") assert "futard.io" not in display def test_record_3_confident_fabrication(self): """Record #3: bot listed 4 wrong ownership coins at 0.7 confidence. The confidence floor (0.3) doesn't catch this — 0.7 > 0.3. This test documents the gap. Layer 3 (Haiku grounding) is needed. """ confidence = 0.7 blocked = 1 if confidence < 0.3 else 0 assert blocked == 0, "Confidence floor correctly does NOT catch high-confidence fabrication" # This is a known gap — documented, not a test failure def test_record_6_low_confidence_speculating(self): """Record #6: confidence 0.1, bot still speculated. Confidence floor should flag this in observation mode. """ confidence = 0.1 blocked = 1 if confidence is not None and confidence < 0.3 else 0 assert blocked == 1, "Should flag confidence 0.1 in observation mode"