teleo-infrastructure/tests/test_research_eval_schema_sql.py
twentyOne2x 1a71efcde2
Add Teleo research eval schema
Adds graph schema prerequisite plus research-eval schema/docs/tests for Leo tool-use benchmarks and x402 research telemetry. Validated by full local pytest and green CI.
2026-06-24 14:21:03 +02:00

365 lines
13 KiB
Python

from __future__ import annotations
import sqlite3
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parents[1]
GRAPH_SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-graph-v1.sql"
RESEARCH_EVAL_SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-research-eval-v1.sql"
def _conn() -> sqlite3.Connection:
conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA foreign_keys = ON")
conn.executescript(GRAPH_SCHEMA_SQL.read_text())
conn.executescript(RESEARCH_EVAL_SCHEMA_SQL.read_text())
return conn
def test_research_eval_schema_applies_after_graph_schema():
conn = _conn()
versions = {
row["version"]
for row in conn.execute("SELECT version FROM graph_schema_version").fetchall()
}
assert versions == {
"teleo-agent-graph-v1",
"teleo-agent-research-eval-v1",
}
tables = {
row["name"]
for row in conn.execute(
"SELECT name FROM sqlite_master WHERE type = 'table'"
).fetchall()
}
assert {
"agent_research_runs",
"agent_tool_invocations",
"agent_research_sources",
"agent_eval_cases",
"agent_eval_results",
"work_order_graph_links",
} <= tables
def test_ranger_liquidation_case_routes_to_source_backed_research_not_market_data():
conn = _conn()
conn.execute(
"INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
)
conn.execute(
"""INSERT INTO agent_eval_cases
(id, suite_id, case_slug, prompt_sha256, prompt_excerpt, expected_route,
expected_provider, must_use_tools_json, must_not_use_tools_json, tags_json, rubric_json)
VALUES
(
'eval-ranger-liquidated-v1',
'leo-research-routing-v1',
'ranger-liquidated-not-fair-value',
'sha256:ranger-prompt',
'Is Ranger Finance fairly valued today given Ranger Finance is liquidated and gone?',
'web_search',
'agentcash-stableenrich-exa-search',
'["source-backed web research"]',
'["structured_market_data_only", "live_token_fair_value"]',
'["ranger_liquidated", "valuation", "source_verification"]',
'{"routing": "verify liquidation before valuation framing"}'
)"""
)
conn.execute(
"""INSERT INTO agent_research_runs
(id, agent_slug, source_surface, source_ref, request_kind, sponsored_work_order_id,
payment_receipt_id, prompt_sha256, prompt_excerpt, selected_provider, selected_route,
status, answer_sha256, answer_excerpt, proof_ref, cost_amount, latency_ms, source_count)
VALUES
(
'run-ranger-liquidated-001',
'leo',
'telegram',
'telegram:group:message-123',
'paid_quote',
'sponsored_work_orders:test-ranger-001',
'payment_receipts:test-ranger-001',
'sha256:ranger-prompt',
'Is Ranger Finance fairly valued today given Ranger Finance is liquidated and gone?',
'agentcash-stableenrich-exa-search',
'web_search',
'answered',
'sha256:ranger-answer',
'Verified liquidation/gone status before valuation framing.',
'proof/leo-ranger-liquidated-routing.json',
0.01,
1240,
3
)"""
)
conn.executemany(
"""INSERT INTO agent_tool_invocations
(id, research_run_id, sequence, provider, tool_name, tool_category, decision,
decision_reason, paid, rail, network, amount, payment_receipt_id, input_sha256,
output_sha256, source_count, latency_ms)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
[
(
"tool-ranger-market-rejected",
"run-ranger-liquidated-001",
1,
"DexScreener",
"structured-market-context",
"market_data",
"rejected",
"Ranger liquidation status must be verified before treating this as a live token valuation.",
0,
"free",
None,
0,
None,
"sha256:market-input",
None,
0,
12,
),
(
"tool-ranger-web-selected",
"run-ranger-liquidated-001",
2,
"AgentCash StableEnrich",
"exa-search",
"web_search",
"executed",
"Source-backed liquidation and status verification required.",
1,
"agentcash",
"solana:5eykt4UsFv8P8NJdTREpY1vzqKqZKvdp",
0.01,
"payment_receipts:test-ranger-001",
"sha256:exa-input",
"sha256:exa-output",
3,
1228,
),
],
)
conn.executemany(
"""INSERT INTO agent_research_sources
(id, research_run_id, tool_invocation_id, source_type, source_uri_sha256,
title, cited, retrieval_rank, support_status)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
[
(
"source-ranger-official",
"run-ranger-liquidated-001",
"tool-ranger-web-selected",
"web",
"sha256:ranger-official",
"Ranger status source",
1,
1,
"supports",
),
(
"source-ranger-forum",
"run-ranger-liquidated-001",
"tool-ranger-web-selected",
"web",
"sha256:ranger-forum",
"MetaDAO/Ranger discussion source",
1,
2,
"context",
),
],
)
conn.execute(
"""INSERT INTO graph_evaluation_runs
(id, target_layer, target_id, trigger_type, evaluator, verdict, confidence, notes)
VALUES
(
'graph-eval-ranger-routing',
'claim',
'ranger-liquidated-status',
'manual',
'leo-research-routing-benchmark',
'approve',
0.92,
'Tool choice matched Ranger liquidation guard.'
)"""
)
conn.execute(
"""INSERT INTO agent_eval_results
(id, eval_case_id, research_run_id, graph_evaluation_run_id, status, score,
routing_correct, tool_choice_score, source_quality_score, groundedness_score,
freshness_score, cost_efficiency_score, safety_payment_score, proof_ref)
VALUES
(
'eval-result-ranger-liquidated-001',
'eval-ranger-liquidated-v1',
'run-ranger-liquidated-001',
'graph-eval-ranger-routing',
'passed',
0.94,
1,
1.0,
0.9,
0.9,
0.85,
0.8,
1.0,
'proof/leo-ranger-liquidated-routing.json'
)"""
)
conn.execute(
"""INSERT INTO work_order_graph_links
(id, sponsored_work_order_id, role, graph_layer, graph_id, rationale)
VALUES
(
'wo-ranger-run-link',
'sponsored_work_orders:test-ranger-001',
'research_run',
'agent_research_run',
'run-ranger-liquidated-001',
'Paid work order produced source-backed research run.'
)"""
)
row = conn.execute(
"""SELECT
r.selected_route,
r.selected_provider,
er.status AS eval_status,
er.routing_correct,
er.tool_choice_score,
COUNT(s.id) AS cited_source_count
FROM agent_research_runs r
JOIN agent_eval_results er ON er.research_run_id = r.id
LEFT JOIN agent_research_sources s ON s.research_run_id = r.id AND s.cited = 1
WHERE r.id = 'run-ranger-liquidated-001'
GROUP BY r.id, er.id"""
).fetchone()
market_executed = conn.execute(
"""SELECT COUNT(*) AS n
FROM agent_tool_invocations
WHERE research_run_id = 'run-ranger-liquidated-001'
AND tool_category = 'market_data'
AND decision = 'executed'"""
).fetchone()["n"]
rejected_market = conn.execute(
"""SELECT COUNT(*) AS n
FROM agent_tool_invocations
WHERE research_run_id = 'run-ranger-liquidated-001'
AND tool_category = 'market_data'
AND decision = 'rejected'"""
).fetchone()["n"]
assert dict(row) == {
"selected_route": "web_search",
"selected_provider": "agentcash-stableenrich-exa-search",
"eval_status": "passed",
"routing_correct": 1,
"tool_choice_score": 1.0,
"cited_source_count": 2,
}
assert market_executed == 0
assert rejected_market == 1
def test_schema_rejects_secret_flags_bad_scores_and_bad_tool_decisions():
conn = _conn()
conn.execute(
"INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
)
conn.execute(
"""INSERT INTO agent_research_runs
(id, agent_slug, source_surface, request_kind, prompt_sha256, selected_route, status)
VALUES ('run-constraints', 'leo', 'test', 'benchmark', 'sha256:prompt', 'web_search', 'answered')"""
)
conn.execute(
"""INSERT INTO agent_eval_cases
(id, suite_id, case_slug, prompt_sha256, prompt_excerpt, expected_route)
VALUES ('case-constraints', 'suite', 'case', 'sha256:prompt', 'redacted prompt', 'web_search')"""
)
invalid_statements = [
"""INSERT INTO agent_research_runs
(id, agent_slug, source_surface, request_kind, prompt_sha256, selected_route, status, secret_values_included)
VALUES ('run-secret', 'leo', 'test', 'benchmark', 'sha256:secret', 'web_search', 'answered', 1)""",
"""INSERT INTO agent_tool_invocations
(id, research_run_id, provider, tool_name, tool_category, decision, decision_reason)
VALUES ('tool-bad-decision', 'run-constraints', 'p', 't', 'web_search', 'approved', 'bad enum')""",
"""INSERT INTO agent_eval_results
(id, eval_case_id, research_run_id, status, score)
VALUES ('eval-bad-score', 'case-constraints', 'run-constraints', 'passed', 1.1)""",
"""INSERT INTO agent_eval_results
(id, eval_case_id, research_run_id, status, routing_correct)
VALUES ('eval-bad-bool', 'case-constraints', 'run-constraints', 'passed', 2)""",
]
for statement in invalid_statements:
try:
conn.execute(statement)
except sqlite3.IntegrityError:
pass
else:
raise AssertionError(f"invalid statement unexpectedly passed: {statement}")
def test_research_run_can_be_recorded_without_raw_prompt_or_private_payloads():
conn = _conn()
conn.execute(
"INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
)
conn.execute(
"""INSERT INTO agent_research_runs
(id, agent_slug, source_surface, source_ref, request_kind, prompt_sha256,
selected_route, status, answer_sha256, proof_ref)
VALUES
(
'run-hash-only',
'leo',
'api',
'api:request-redacted',
'paid_work_order',
'sha256:prompt-only',
'social_trends',
'answered',
'sha256:answer-only',
'proof/hash-only.json'
)"""
)
conn.execute(
"""INSERT INTO agent_tool_invocations
(id, research_run_id, provider, tool_name, tool_category, decision,
decision_reason, input_sha256, output_sha256)
VALUES
(
'tool-hash-only',
'run-hash-only',
'AgentCash StableSocial',
'lightreel-trends',
'social_trends',
'executed',
'Question asks for current Twitter/X discussion.',
'sha256:input-only',
'sha256:output-only'
)"""
)
row = conn.execute(
"""SELECT
r.prompt_excerpt,
r.answer_excerpt,
r.secret_values_included AS run_secret_flag,
t.secret_values_included AS tool_secret_flag
FROM agent_research_runs r
JOIN agent_tool_invocations t ON t.research_run_id = r.id
WHERE r.id = 'run-hash-only'"""
).fetchone()
assert row["prompt_excerpt"] is None
assert row["answer_excerpt"] is None
assert row["run_secret_flag"] == 0
assert row["tool_secret_flag"] == 0