Merges all work from epimetheus/enrichment-dedup-fix and epimetheus/eval-and-reweave-fixes: - Eval pipeline: _LLMResponse in call_openrouter, URL fabrication check, confidence floor, cost alerts - Reweave fixes: _is_entity gate, _same_source filter, temp 0.3, blank line sanitization - Enrichment dedup: three-layer fix (source-slug, PR-number, post-rebase scan) - Cherry-pick merge: replaces rebase-retry, --ours entity conflict resolution - TG batching: group by chat_id + time proximity, force-split on unparseable timestamps - Schema migration v10: response_audit columns for cost/confidence/blocking 67 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
320 lines · 13 KiB · Python
"""Tests for eval pipeline — cost tracking, URL fabrication check, confidence floor.
|
|
|
|
Imports from telegram/eval.py (production code). No local reimplementations.
|
|
|
|
Tests validate against real failure modes from audit records:
|
|
- Record #12: hallucinated futard.io URL
|
|
- Records #3, #9: confident fabrication at 0.7
|
|
- Records #6, #7: low confidence (0.1) with no gate
|
|
"""
|
|
|
|
import sqlite3
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Add telegram/ to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "telegram"))
|
|
|
|
from eval import (
|
|
_LLMResponse,
|
|
estimate_cost,
|
|
check_url_fabrication,
|
|
apply_confidence_floor,
|
|
MODEL_PRICING,
|
|
CONFIDENCE_FLOOR,
|
|
COST_ALERT_THRESHOLD,
|
|
)
|
|
|
|
|
|
# ─── estimate_cost tests ─────────────────────────────────────────────────


class TestEstimateCost:
    """Cost estimation tests.

    estimate_cost(model, prompt_tokens, completion_tokens) returns USD,
    priced per million tokens from MODEL_PRICING (see inline arithmetic).
    """

    def test_opus_typical_response(self):
        """Typical Opus response: ~2000 prompt tokens, ~500 completion."""
        cost = estimate_cost("anthropic/claude-opus-4-6", 2000, 500)
        # 2000 * 15/1M + 500 * 75/1M = 0.03 + 0.0375 = 0.0675
        assert cost == pytest.approx(0.0675, abs=1e-4)

    def test_haiku_cheap(self):
        """Haiku calls should be very cheap."""
        cost = estimate_cost("anthropic/claude-haiku-4.5", 1000, 200)
        # 1000 * 0.8/1M + 200 * 4/1M = 0.0008 + 0.0008 = 0.0016
        assert cost == pytest.approx(0.0016, abs=1e-4)

    def test_unknown_model_uses_sonnet_default(self):
        """Unknown model falls back to Sonnet pricing ($3/$15)."""
        cost = estimate_cost("some-unknown/model", 1000, 1000)
        # 1000 * 3/1M + 1000 * 15/1M = 0.003 + 0.015 = 0.018
        assert cost == pytest.approx(0.018, abs=1e-4)

    def test_zero_tokens_zero_cost(self):
        """Zero tokens must cost exactly nothing."""
        cost = estimate_cost("anthropic/claude-opus-4-6", 0, 0)
        assert cost == 0.0

    def test_gpt4o_mini_cheapest(self):
        """GPT-4o-mini should be cheapest mainstream model."""
        cost = estimate_cost("openai/gpt-4o-mini", 10000, 1000)
        assert cost < 0.003  # very cheap

    def test_opus_more_expensive_than_haiku(self):
        """Same token counts, Opus should be ~20x more expensive than Haiku."""
        opus = estimate_cost("anthropic/claude-opus-4-6", 1000, 500)
        haiku = estimate_cost("anthropic/claude-haiku-4.5", 1000, 500)
        assert opus > haiku * 10
# ─── URL fabrication tests ───────────────────────────────────────────────


class TestURLFabrication:
    """URL fabrication detection — catches failure mode #2 (record #12).

    check_url_fabrication(response, kb_context) -> (cleaned, fabricated):
    any response URL not present in the KB context is stripped from the
    cleaned text and reported in the fabricated list.
    """

    def test_no_urls_in_response(self):
        """Response without URLs passes through unchanged."""
        cleaned, fabricated = check_url_fabrication("MetaDAO uses futarchy.", "some kb context")
        assert fabricated == []
        assert cleaned == "MetaDAO uses futarchy."

    def test_url_present_in_context(self):
        """URL that exists in KB context is NOT flagged."""
        response = "Check out https://metadao.fi/proposals for details."
        context = "Source: https://metadao.fi/proposals — MetaDAO governance"
        cleaned, fabricated = check_url_fabrication(response, context)
        assert fabricated == []
        assert cleaned == response

    def test_fabricated_url_caught(self):
        """Record #12: bot fabricated futard.io URL — should be caught."""
        response = "You can find the proposal at https://futard.io/proposal/GPT8d..."
        context = "MetaDAO uses conditional tokens for governance decisions."
        cleaned, fabricated = check_url_fabrication(response, context)
        assert len(fabricated) == 1
        assert "futard.io" in fabricated[0]
        assert "futard.io" not in cleaned
        assert "[URL removed — not verified]" in cleaned

    def test_multiple_fabricated_urls(self):
        """Multiple fabricated URLs all get caught."""
        response = (
            "See https://fake1.com/page and also https://fake2.org/data "
            "and the real one https://metadao.fi"
        )
        context = "Source: https://metadao.fi — real URL"
        cleaned, fabricated = check_url_fabrication(response, context)
        assert len(fabricated) == 2
        fab_str = " ".join(fabricated)
        assert "fake1.com" in fab_str
        assert "fake2.org" in fab_str

    def test_url_in_parentheses(self):
        """URL inside markdown link syntax should be extracted."""
        response = "Check [here](https://fabricated.io/page) for more."
        context = "No URLs in context."
        cleaned, fabricated = check_url_fabrication(response, context)
        assert len(fabricated) == 1
        assert "fabricated.io" in fabricated[0]

    def test_empty_context_flags_all_urls(self):
        """If KB context has no URLs, any response URL is fabricated."""
        cleaned, fabricated = check_url_fabrication("See https://example.com for more.", "")
        assert len(fabricated) == 1
# ─── Confidence floor tests ─────────────────────────────────────────────


class TestConfidenceFloor:
    """Confidence floor tests — catches failure modes #4, #6, #7.

    apply_confidence_floor(text, confidence) -> (display, blocked, reason):
    scores strictly below the floor (0.3) get a caveat prefix and are
    flagged as blocked; None (unparsed confidence) passes through.
    """

    def test_low_confidence_gets_caveat(self):
        """Confidence < 0.3 should trigger caveat prefix."""
        display, blocked, reason = apply_confidence_floor("Some response.", 0.1)
        assert blocked is True
        assert "0.10" in display
        assert "caution" in display.lower()
        assert reason is not None

    def test_high_confidence_no_caveat(self):
        """Confidence >= 0.3 should pass through unchanged."""
        display, blocked, reason = apply_confidence_floor("MetaDAO uses conditional tokens.", 0.7)
        assert blocked is False
        assert reason is None
        assert display == "MetaDAO uses conditional tokens."

    def test_none_confidence_no_caveat(self):
        """None confidence (parsing failure) should not trigger caveat."""
        display, blocked, reason = apply_confidence_floor("Some response.", None)
        assert blocked is False
        assert display == "Some response."

    def test_boundary_value_0_3(self):
        """Confidence exactly 0.3 should NOT trigger (< not <=)."""
        display, blocked, reason = apply_confidence_floor("Response.", 0.3)
        assert blocked is False

    def test_boundary_value_0_29(self):
        """Confidence 0.29 should trigger."""
        display, blocked, reason = apply_confidence_floor("Response.", 0.29)
        assert blocked is True
# ─── _LLMResponse tests ─────────────────────────────────────────────────


class TestLLMResponse:
    """Test the _LLMResponse string subclass (str + token/cost metadata)."""

    def test_behaves_as_string(self):
        """_LLMResponse supports ordinary str operations."""
        r = _LLMResponse("Hello world")
        assert str(r) == "Hello world"
        assert "Hello" in r
        assert len(r) == 11

    def test_carries_metadata(self):
        """Token counts, cost, and model name ride along on the string."""
        r = _LLMResponse(
            "response text",
            prompt_tokens=2000,
            completion_tokens=500,
            cost=0.0675,
            model="anthropic/claude-opus-4-6",
        )
        assert r.prompt_tokens == 2000
        assert r.completion_tokens == 500
        assert r.cost == 0.0675
        assert r.model == "anthropic/claude-opus-4-6"

    def test_getattr_works(self):
        """bot.py uses getattr(response, 'cost', 0.0)."""
        r = _LLMResponse("text", cost=0.05)
        assert getattr(r, 'cost', 0.0) == 0.05

    def test_getattr_on_none_returns_default(self):
        """When response is None, getattr should return defaults."""
        response = None
        assert getattr(response, 'prompt_tokens', 0) == 0
        assert getattr(response, 'cost', 0.0) == 0.0
# ─── Schema migration tests ─────────────────────────────────────────────


class TestSchemaV10:
    """Test that migration v10 adds correct columns.

    Uses in-memory SQLite; connections are closed in finally blocks so a
    failing assertion never leaks a handle.
    """

    def test_migration_adds_columns(self):
        """Verify migration v10 adds all 8 new columns to response_audit."""
        conn = sqlite3.connect(":memory:")
        try:
            conn.execute("CREATE TABLE schema_version (version INTEGER PRIMARY KEY, applied_at TEXT)")
            conn.execute("INSERT INTO schema_version (version) VALUES (9)")
            conn.execute("""
                CREATE TABLE response_audit (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT, chat_id INTEGER, user TEXT,
                    agent TEXT DEFAULT 'rio', model TEXT, query TEXT,
                    confidence_score REAL, response_time_ms INTEGER,
                    created_at TEXT
                )
            """)

            # Run the actual migration logic (same column list as db.py v10).
            new_cols = [
                ("prompt_tokens", "INTEGER"),
                ("completion_tokens", "INTEGER"),
                ("generation_cost", "REAL"),
                ("embedding_cost", "REAL"),
                ("total_cost", "REAL"),
                ("blocked", "INTEGER DEFAULT 0"),
                ("block_reason", "TEXT"),
                ("query_type", "TEXT"),
            ]
            for col_name, col_type in new_cols:
                try:
                    conn.execute(f"ALTER TABLE response_audit ADD COLUMN {col_name} {col_type}")
                except sqlite3.OperationalError:
                    # Column already exists — migration must be idempotent.
                    pass

            cols = [row[1] for row in conn.execute("PRAGMA table_info(response_audit)").fetchall()]
            for col_name, _ in new_cols:
                assert col_name in cols, f"Missing column: {col_name}"
        finally:
            conn.close()

    def test_insert_with_new_columns(self):
        """Verify insert works with eval columns."""
        conn = sqlite3.connect(":memory:")
        try:
            conn.execute("""
                CREATE TABLE response_audit (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    query TEXT, prompt_tokens INTEGER, completion_tokens INTEGER,
                    generation_cost REAL, blocked INTEGER DEFAULT 0, block_reason TEXT
                )
            """)
            conn.execute(
                "INSERT INTO response_audit (query, prompt_tokens, completion_tokens, generation_cost, blocked, block_reason) VALUES (?, ?, ?, ?, ?, ?)",
                ("test query", 2000, 500, 0.0675, 1, "confidence_floor: 0.1"),
            )
            row = conn.execute("SELECT * FROM response_audit").fetchone()
            # Column order: id, query, prompt_tokens, ..., blocked at index 5.
            assert row[1] == "test query"
            assert row[2] == 2000
            assert row[5] == 1
        finally:
            conn.close()

    def test_migration_idempotent(self):
        """Running migration twice should not error."""
        conn = sqlite3.connect(":memory:")
        try:
            conn.execute("CREATE TABLE response_audit (id INTEGER PRIMARY KEY, query TEXT)")
            for _ in range(2):
                for col_name, col_type in [("blocked", "INTEGER DEFAULT 0"), ("total_cost", "REAL")]:
                    try:
                        conn.execute(f"ALTER TABLE response_audit ADD COLUMN {col_name} {col_type}")
                    except sqlite3.OperationalError:
                        pass
            cols = [row[1] for row in conn.execute("PRAGMA table_info(response_audit)").fetchall()]
            assert "blocked" in cols
            assert "total_cost" in cols
        finally:
            conn.close()
# ─── Real failure mode replays ───────────────────────────────────────────


class TestRealFailureModes:
    """Replay real failure modes from audit records."""

    def test_record_12_fabricated_url(self):
        """Record #12: futard.io/proposal/GPT8d... — completely fabricated."""
        response = (
            "You can find the proposal at https://futard.io/proposal/GPT8d... "
            "which shows the conditional token mechanics."
        )
        kb_context = "MetaDAO uses conditional tokens for governance decisions."
        cleaned, fabricated = check_url_fabrication(response, kb_context)
        assert len(fabricated) > 0, "Should catch fabricated futard.io URL"
        assert "futard.io" not in cleaned

    def test_record_3_confident_fabrication(self):
        """Record #3: 0.7 confidence fabrication — floor doesn't catch.

        Documents the gap — Layer 3 needed.
        """
        _, blocked, _ = apply_confidence_floor("Wrong content", 0.7)
        assert blocked is False  # Correctly doesn't catch — known gap

    def test_record_6_low_confidence(self):
        """Record #6: confidence 0.1, should be flagged."""
        _, blocked, _ = apply_confidence_floor("Speculative response", 0.1)
        assert blocked is True
# ─── Constants validation ────────────────────────────────────────────────


class TestConstants:
    """Pin eval.py constants so a silent pricing/threshold edit fails tests."""

    def test_confidence_floor_value(self):
        """Floor below which responses are blocked/caveated."""
        assert CONFIDENCE_FLOOR == 0.3

    def test_cost_alert_threshold(self):
        """Per-response USD cost that triggers an alert."""
        assert COST_ALERT_THRESHOLD == 0.22

    def test_opus_pricing_present(self):
        """Opus must have an explicit pricing entry (no fallback)."""
        assert "anthropic/claude-opus-4-6" in MODEL_PRICING

    def test_haiku_pricing_correct(self):
        """Haiku rates: $0.80/M input, $4.00/M output."""
        input_rate, output_rate = MODEL_PRICING["anthropic/claude-haiku-4.5"]
        assert input_rate == 0.80
        assert output_rate == 4.0
|