Adds graph schema prerequisite plus research-eval schema/docs/tests for Leo tool-use benchmarks and x402 research telemetry. Validated by full local pytest and green CI.
365 lines
13 KiB
Python
365 lines
13 KiB
Python
from __future__ import annotations
|
|
|
|
import sqlite3
|
|
from pathlib import Path
|
|
|
|
REPO_ROOT = Path(__file__).resolve().parents[1]
|
|
GRAPH_SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-graph-v1.sql"
|
|
RESEARCH_EVAL_SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-research-eval-v1.sql"
|
|
|
|
|
|
def _conn() -> sqlite3.Connection:
|
|
conn = sqlite3.connect(":memory:")
|
|
conn.row_factory = sqlite3.Row
|
|
conn.execute("PRAGMA foreign_keys = ON")
|
|
conn.executescript(GRAPH_SCHEMA_SQL.read_text())
|
|
conn.executescript(RESEARCH_EVAL_SCHEMA_SQL.read_text())
|
|
return conn
|
|
|
|
|
|
def test_research_eval_schema_applies_after_graph_schema():
|
|
conn = _conn()
|
|
|
|
versions = {
|
|
row["version"]
|
|
for row in conn.execute("SELECT version FROM graph_schema_version").fetchall()
|
|
}
|
|
assert versions == {
|
|
"teleo-agent-graph-v1",
|
|
"teleo-agent-research-eval-v1",
|
|
}
|
|
|
|
tables = {
|
|
row["name"]
|
|
for row in conn.execute(
|
|
"SELECT name FROM sqlite_master WHERE type = 'table'"
|
|
).fetchall()
|
|
}
|
|
assert {
|
|
"agent_research_runs",
|
|
"agent_tool_invocations",
|
|
"agent_research_sources",
|
|
"agent_eval_cases",
|
|
"agent_eval_results",
|
|
"work_order_graph_links",
|
|
} <= tables
|
|
|
|
|
|
def test_ranger_liquidation_case_routes_to_source_backed_research_not_market_data():
|
|
conn = _conn()
|
|
conn.execute(
|
|
"INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
|
|
)
|
|
conn.execute(
|
|
"""INSERT INTO agent_eval_cases
|
|
(id, suite_id, case_slug, prompt_sha256, prompt_excerpt, expected_route,
|
|
expected_provider, must_use_tools_json, must_not_use_tools_json, tags_json, rubric_json)
|
|
VALUES
|
|
(
|
|
'eval-ranger-liquidated-v1',
|
|
'leo-research-routing-v1',
|
|
'ranger-liquidated-not-fair-value',
|
|
'sha256:ranger-prompt',
|
|
'Is Ranger Finance fairly valued today given Ranger Finance is liquidated and gone?',
|
|
'web_search',
|
|
'agentcash-stableenrich-exa-search',
|
|
'["source-backed web research"]',
|
|
'["structured_market_data_only", "live_token_fair_value"]',
|
|
'["ranger_liquidated", "valuation", "source_verification"]',
|
|
'{"routing": "verify liquidation before valuation framing"}'
|
|
)"""
|
|
)
|
|
conn.execute(
|
|
"""INSERT INTO agent_research_runs
|
|
(id, agent_slug, source_surface, source_ref, request_kind, sponsored_work_order_id,
|
|
payment_receipt_id, prompt_sha256, prompt_excerpt, selected_provider, selected_route,
|
|
status, answer_sha256, answer_excerpt, proof_ref, cost_amount, latency_ms, source_count)
|
|
VALUES
|
|
(
|
|
'run-ranger-liquidated-001',
|
|
'leo',
|
|
'telegram',
|
|
'telegram:group:message-123',
|
|
'paid_quote',
|
|
'sponsored_work_orders:test-ranger-001',
|
|
'payment_receipts:test-ranger-001',
|
|
'sha256:ranger-prompt',
|
|
'Is Ranger Finance fairly valued today given Ranger Finance is liquidated and gone?',
|
|
'agentcash-stableenrich-exa-search',
|
|
'web_search',
|
|
'answered',
|
|
'sha256:ranger-answer',
|
|
'Verified liquidation/gone status before valuation framing.',
|
|
'proof/leo-ranger-liquidated-routing.json',
|
|
0.01,
|
|
1240,
|
|
3
|
|
)"""
|
|
)
|
|
conn.executemany(
|
|
"""INSERT INTO agent_tool_invocations
|
|
(id, research_run_id, sequence, provider, tool_name, tool_category, decision,
|
|
decision_reason, paid, rail, network, amount, payment_receipt_id, input_sha256,
|
|
output_sha256, source_count, latency_ms)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
|
[
|
|
(
|
|
"tool-ranger-market-rejected",
|
|
"run-ranger-liquidated-001",
|
|
1,
|
|
"DexScreener",
|
|
"structured-market-context",
|
|
"market_data",
|
|
"rejected",
|
|
"Ranger liquidation status must be verified before treating this as a live token valuation.",
|
|
0,
|
|
"free",
|
|
None,
|
|
0,
|
|
None,
|
|
"sha256:market-input",
|
|
None,
|
|
0,
|
|
12,
|
|
),
|
|
(
|
|
"tool-ranger-web-selected",
|
|
"run-ranger-liquidated-001",
|
|
2,
|
|
"AgentCash StableEnrich",
|
|
"exa-search",
|
|
"web_search",
|
|
"executed",
|
|
"Source-backed liquidation and status verification required.",
|
|
1,
|
|
"agentcash",
|
|
"solana:5eykt4UsFv8P8NJdTREpY1vzqKqZKvdp",
|
|
0.01,
|
|
"payment_receipts:test-ranger-001",
|
|
"sha256:exa-input",
|
|
"sha256:exa-output",
|
|
3,
|
|
1228,
|
|
),
|
|
],
|
|
)
|
|
conn.executemany(
|
|
"""INSERT INTO agent_research_sources
|
|
(id, research_run_id, tool_invocation_id, source_type, source_uri_sha256,
|
|
title, cited, retrieval_rank, support_status)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
|
[
|
|
(
|
|
"source-ranger-official",
|
|
"run-ranger-liquidated-001",
|
|
"tool-ranger-web-selected",
|
|
"web",
|
|
"sha256:ranger-official",
|
|
"Ranger status source",
|
|
1,
|
|
1,
|
|
"supports",
|
|
),
|
|
(
|
|
"source-ranger-forum",
|
|
"run-ranger-liquidated-001",
|
|
"tool-ranger-web-selected",
|
|
"web",
|
|
"sha256:ranger-forum",
|
|
"MetaDAO/Ranger discussion source",
|
|
1,
|
|
2,
|
|
"context",
|
|
),
|
|
],
|
|
)
|
|
conn.execute(
|
|
"""INSERT INTO graph_evaluation_runs
|
|
(id, target_layer, target_id, trigger_type, evaluator, verdict, confidence, notes)
|
|
VALUES
|
|
(
|
|
'graph-eval-ranger-routing',
|
|
'claim',
|
|
'ranger-liquidated-status',
|
|
'manual',
|
|
'leo-research-routing-benchmark',
|
|
'approve',
|
|
0.92,
|
|
'Tool choice matched Ranger liquidation guard.'
|
|
)"""
|
|
)
|
|
conn.execute(
|
|
"""INSERT INTO agent_eval_results
|
|
(id, eval_case_id, research_run_id, graph_evaluation_run_id, status, score,
|
|
routing_correct, tool_choice_score, source_quality_score, groundedness_score,
|
|
freshness_score, cost_efficiency_score, safety_payment_score, proof_ref)
|
|
VALUES
|
|
(
|
|
'eval-result-ranger-liquidated-001',
|
|
'eval-ranger-liquidated-v1',
|
|
'run-ranger-liquidated-001',
|
|
'graph-eval-ranger-routing',
|
|
'passed',
|
|
0.94,
|
|
1,
|
|
1.0,
|
|
0.9,
|
|
0.9,
|
|
0.85,
|
|
0.8,
|
|
1.0,
|
|
'proof/leo-ranger-liquidated-routing.json'
|
|
)"""
|
|
)
|
|
conn.execute(
|
|
"""INSERT INTO work_order_graph_links
|
|
(id, sponsored_work_order_id, role, graph_layer, graph_id, rationale)
|
|
VALUES
|
|
(
|
|
'wo-ranger-run-link',
|
|
'sponsored_work_orders:test-ranger-001',
|
|
'research_run',
|
|
'agent_research_run',
|
|
'run-ranger-liquidated-001',
|
|
'Paid work order produced source-backed research run.'
|
|
)"""
|
|
)
|
|
|
|
row = conn.execute(
|
|
"""SELECT
|
|
r.selected_route,
|
|
r.selected_provider,
|
|
er.status AS eval_status,
|
|
er.routing_correct,
|
|
er.tool_choice_score,
|
|
COUNT(s.id) AS cited_source_count
|
|
FROM agent_research_runs r
|
|
JOIN agent_eval_results er ON er.research_run_id = r.id
|
|
LEFT JOIN agent_research_sources s ON s.research_run_id = r.id AND s.cited = 1
|
|
WHERE r.id = 'run-ranger-liquidated-001'
|
|
GROUP BY r.id, er.id"""
|
|
).fetchone()
|
|
|
|
market_executed = conn.execute(
|
|
"""SELECT COUNT(*) AS n
|
|
FROM agent_tool_invocations
|
|
WHERE research_run_id = 'run-ranger-liquidated-001'
|
|
AND tool_category = 'market_data'
|
|
AND decision = 'executed'"""
|
|
).fetchone()["n"]
|
|
rejected_market = conn.execute(
|
|
"""SELECT COUNT(*) AS n
|
|
FROM agent_tool_invocations
|
|
WHERE research_run_id = 'run-ranger-liquidated-001'
|
|
AND tool_category = 'market_data'
|
|
AND decision = 'rejected'"""
|
|
).fetchone()["n"]
|
|
|
|
assert dict(row) == {
|
|
"selected_route": "web_search",
|
|
"selected_provider": "agentcash-stableenrich-exa-search",
|
|
"eval_status": "passed",
|
|
"routing_correct": 1,
|
|
"tool_choice_score": 1.0,
|
|
"cited_source_count": 2,
|
|
}
|
|
assert market_executed == 0
|
|
assert rejected_market == 1
|
|
|
|
|
|
def test_schema_rejects_secret_flags_bad_scores_and_bad_tool_decisions():
|
|
conn = _conn()
|
|
conn.execute(
|
|
"INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
|
|
)
|
|
conn.execute(
|
|
"""INSERT INTO agent_research_runs
|
|
(id, agent_slug, source_surface, request_kind, prompt_sha256, selected_route, status)
|
|
VALUES ('run-constraints', 'leo', 'test', 'benchmark', 'sha256:prompt', 'web_search', 'answered')"""
|
|
)
|
|
conn.execute(
|
|
"""INSERT INTO agent_eval_cases
|
|
(id, suite_id, case_slug, prompt_sha256, prompt_excerpt, expected_route)
|
|
VALUES ('case-constraints', 'suite', 'case', 'sha256:prompt', 'redacted prompt', 'web_search')"""
|
|
)
|
|
|
|
invalid_statements = [
|
|
"""INSERT INTO agent_research_runs
|
|
(id, agent_slug, source_surface, request_kind, prompt_sha256, selected_route, status, secret_values_included)
|
|
VALUES ('run-secret', 'leo', 'test', 'benchmark', 'sha256:secret', 'web_search', 'answered', 1)""",
|
|
"""INSERT INTO agent_tool_invocations
|
|
(id, research_run_id, provider, tool_name, tool_category, decision, decision_reason)
|
|
VALUES ('tool-bad-decision', 'run-constraints', 'p', 't', 'web_search', 'approved', 'bad enum')""",
|
|
"""INSERT INTO agent_eval_results
|
|
(id, eval_case_id, research_run_id, status, score)
|
|
VALUES ('eval-bad-score', 'case-constraints', 'run-constraints', 'passed', 1.1)""",
|
|
"""INSERT INTO agent_eval_results
|
|
(id, eval_case_id, research_run_id, status, routing_correct)
|
|
VALUES ('eval-bad-bool', 'case-constraints', 'run-constraints', 'passed', 2)""",
|
|
]
|
|
|
|
for statement in invalid_statements:
|
|
try:
|
|
conn.execute(statement)
|
|
except sqlite3.IntegrityError:
|
|
pass
|
|
else:
|
|
raise AssertionError(f"invalid statement unexpectedly passed: {statement}")
|
|
|
|
|
|
def test_research_run_can_be_recorded_without_raw_prompt_or_private_payloads():
|
|
conn = _conn()
|
|
conn.execute(
|
|
"INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
|
|
)
|
|
conn.execute(
|
|
"""INSERT INTO agent_research_runs
|
|
(id, agent_slug, source_surface, source_ref, request_kind, prompt_sha256,
|
|
selected_route, status, answer_sha256, proof_ref)
|
|
VALUES
|
|
(
|
|
'run-hash-only',
|
|
'leo',
|
|
'api',
|
|
'api:request-redacted',
|
|
'paid_work_order',
|
|
'sha256:prompt-only',
|
|
'social_trends',
|
|
'answered',
|
|
'sha256:answer-only',
|
|
'proof/hash-only.json'
|
|
)"""
|
|
)
|
|
conn.execute(
|
|
"""INSERT INTO agent_tool_invocations
|
|
(id, research_run_id, provider, tool_name, tool_category, decision,
|
|
decision_reason, input_sha256, output_sha256)
|
|
VALUES
|
|
(
|
|
'tool-hash-only',
|
|
'run-hash-only',
|
|
'AgentCash StableSocial',
|
|
'lightreel-trends',
|
|
'social_trends',
|
|
'executed',
|
|
'Question asks for current Twitter/X discussion.',
|
|
'sha256:input-only',
|
|
'sha256:output-only'
|
|
)"""
|
|
)
|
|
|
|
row = conn.execute(
|
|
"""SELECT
|
|
r.prompt_excerpt,
|
|
r.answer_excerpt,
|
|
r.secret_values_included AS run_secret_flag,
|
|
t.secret_values_included AS tool_secret_flag
|
|
FROM agent_research_runs r
|
|
JOIN agent_tool_invocations t ON t.research_run_id = r.id
|
|
WHERE r.id = 'run-hash-only'"""
|
|
).fetchone()
|
|
|
|
assert row["prompt_excerpt"] is None
|
|
assert row["answer_excerpt"] is None
|
|
assert row["run_secret_flag"] == 0
|
|
assert row["tool_secret_flag"] == 0
|