teleo-infrastructure/schemas/teleo-agent-research-eval-v1.sql
twentyOne2x 1a71efcde2
Add Teleo research eval schema
Adds graph schema prerequisite plus research-eval schema/docs/tests for Leo tool-use benchmarks and x402 research telemetry. Validated by full local pytest and green CI.
2026-06-24 14:21:03 +02:00

247 lines
10 KiB
SQL

-- Teleo Agent Research Eval Schema v1
-- Common SQL subset intended for ephemeral SQLite tests and Postgres/Supabase
-- staging. IDs are app-generated text IDs so this can run across engines.
--
-- Apply after teleo-agent-graph-v1.sql.
--
-- Secret policy: store hashes, redacted excerpts, and proof references only.
-- Raw prompts, bearer tokens, API keys, wallet secrets, and private receipts do
-- not belong in these tables.
-- Register this schema version in graph_schema_version.
--
-- Idempotent: when a row with a conflicting unique/primary key already exists,
-- the insert is skipped and the existing row is left untouched.
--
-- ON CONFLICT DO NOTHING replaces the SQLite-only INSERT OR IGNORE form so the
-- statement parses on both SQLite (3.24+) and Postgres, the two engines this
-- file targets. Unlike OR IGNORE, it only suppresses uniqueness conflicts;
-- NOT NULL or CHECK violations now surface as errors instead of being
-- silently dropped.
-- NOTE(review): idempotence relies on graph_schema_version (created by
-- teleo-agent-graph-v1.sql) having a PRIMARY KEY or UNIQUE constraint that
-- covers version — confirm.
INSERT INTO graph_schema_version (version, source)
VALUES ('teleo-agent-research-eval-v1', 'leo-x402-research-routing-benchmark')
ON CONFLICT DO NOTHING;
-- agent_research_runs
-- Parent table of this schema: agent_tool_invocations, agent_research_sources
-- and agent_eval_results each reference agent_research_runs(id).
-- Prompt and answer are kept as a SHA-256 column plus an optional,
-- length-capped excerpt; secret_values_included is pinned to 0 by its CHECK.
CREATE TABLE IF NOT EXISTS agent_research_runs (
    id TEXT PRIMARY KEY,
    agent_slug TEXT NOT NULL REFERENCES agents(slug),

    -- Request origin and kind.
    source_surface TEXT NOT NULL
        CHECK (source_surface IN (
            'telegram', 'api', 'checkout', 'web', 'cli', 'test', 'other'
        )),
    source_ref TEXT,
    request_kind TEXT NOT NULL DEFAULT 'free'
        CHECK (request_kind IN (
            'free', 'paid_quote', 'paid_work_order', 'benchmark', 'system'
        )),

    -- Plain text references; no foreign key is declared on either column.
    sponsored_work_order_id TEXT,
    payment_receipt_id TEXT,

    -- Excerpt length is capped by a table-level CHECK at the end of the table.
    prompt_sha256 TEXT NOT NULL,
    prompt_excerpt TEXT,

    -- Routing decision and lifecycle state.
    selected_provider TEXT,
    selected_route TEXT NOT NULL DEFAULT 'unknown'
        CHECK (selected_route IN (
            'none', 'web_search', 'social_trends', 'structured_market_data',
            'local_context', 'mixed', 'unknown'
        )),
    status TEXT NOT NULL DEFAULT 'running'
        CHECK (status IN (
            'quoted', 'payment_pending', 'running', 'answered',
            'abstained', 'blocked', 'failed', 'cancelled'
        )),

    answer_sha256 TEXT,
    answer_excerpt TEXT,
    proof_ref TEXT,

    -- NOTE(review): cost_amount is REAL (binary floating point); exact decimal
    -- amounts would need NUMERIC. Type kept as-is.
    cost_amount REAL NOT NULL DEFAULT 0 CHECK (cost_amount >= 0),
    currency TEXT NOT NULL DEFAULT 'USDC',
    latency_ms INTEGER CHECK (latency_ms IS NULL OR latency_ms >= 0),
    source_count INTEGER NOT NULL DEFAULT 0 CHECK (source_count >= 0),

    -- Guard column: the CHECK admits only 0.
    secret_values_included INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),

    -- Timestamps are stored as TEXT; created_at is nullable but defaults to
    -- the current timestamp when omitted.
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    completed_at TEXT,

    -- Excerpt size limits (NULL excerpts are allowed).
    CHECK (prompt_excerpt IS NULL OR length(prompt_excerpt) <= 1000),
    CHECK (answer_excerpt IS NULL OR length(answer_excerpt) <= 2000)
);
-- Secondary indexes on agent_research_runs.

-- Composite index on agent and creation time.
CREATE INDEX IF NOT EXISTS idx_agent_research_runs_agent_created
    ON agent_research_runs (agent_slug, created_at);

-- Lookup by sponsored work order reference.
CREATE INDEX IF NOT EXISTS idx_agent_research_runs_work_order
    ON agent_research_runs (sponsored_work_order_id);

-- Composite index on lifecycle status and selected route.
CREATE INDEX IF NOT EXISTS idx_agent_research_runs_status_route
    ON agent_research_runs (status, selected_route);
-- agent_tool_invocations
-- Child rows of agent_research_runs; deleting a run deletes its invocations
-- (ON DELETE CASCADE). Inputs and outputs are recorded as SHA-256 columns only.
CREATE TABLE IF NOT EXISTS agent_tool_invocations (
    id TEXT PRIMARY KEY,
    research_run_id TEXT NOT NULL
        REFERENCES agent_research_runs(id) ON DELETE CASCADE,

    -- Non-negative position within the run. Combined with the UNIQUE
    -- constraint at the end of the table, the default of 0 means a second
    -- insert for the same run that omits sequence will conflict.
    sequence INTEGER NOT NULL DEFAULT 0 CHECK (sequence >= 0),

    provider TEXT NOT NULL,
    tool_name TEXT NOT NULL,
    tool_category TEXT NOT NULL
        CHECK (tool_category IN (
            'web_search', 'social_trends', 'market_data', 'page_read',
            'x402_checkout', 'agentcash', 'faremeter', 'database',
            'local_context', 'other'
        )),
    endpoint_host TEXT,
    endpoint_hash TEXT,

    -- Routing decision for this tool; a reason is mandatory.
    decision TEXT NOT NULL
        CHECK (decision IN (
            'candidate', 'selected', 'executed', 'skipped',
            'rejected', 'fallback', 'failed'
        )),
    decision_reason TEXT NOT NULL,

    -- Payment details. paid is a 0/1 flag; rail and amount may be NULL.
    paid INTEGER NOT NULL DEFAULT 0 CHECK (paid IN (0, 1)),
    rail TEXT
        CHECK (rail IS NULL OR rail IN ('x402', 'agentcash', 'manual', 'free', 'other')),
    network TEXT,
    amount REAL CHECK (amount IS NULL OR amount >= 0),
    currency TEXT NOT NULL DEFAULT 'USDC',
    payment_receipt_id TEXT,

    input_sha256 TEXT,
    output_sha256 TEXT,
    source_count INTEGER NOT NULL DEFAULT 0 CHECK (source_count >= 0),
    latency_ms INTEGER CHECK (latency_ms IS NULL OR latency_ms >= 0),
    error_class TEXT,

    -- Guard column: the CHECK admits only 0.
    secret_values_included INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,

    -- At most one invocation per sequence number within a run.
    UNIQUE (research_run_id, sequence)
);
-- Secondary indexes on agent_tool_invocations.

-- Composite index on parent run and decision.
CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_run_decision
    ON agent_tool_invocations (research_run_id, decision);

-- Composite index on provider and tool category.
CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_provider_category
    ON agent_tool_invocations (provider, tool_category);

-- Lookup by payment receipt reference.
CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_receipt
    ON agent_tool_invocations (payment_receipt_id);
-- agent_research_sources
-- Sources attached to a research run. Rows are removed with their run
-- (ON DELETE CASCADE); the optional link to a tool invocation is cleared
-- rather than cascaded when that invocation is deleted (ON DELETE SET NULL).
CREATE TABLE IF NOT EXISTS agent_research_sources (
    id TEXT PRIMARY KEY,
    research_run_id TEXT NOT NULL
        REFERENCES agent_research_runs(id) ON DELETE CASCADE,
    tool_invocation_id TEXT
        REFERENCES agent_tool_invocations(id) ON DELETE SET NULL,

    source_type TEXT NOT NULL
        CHECK (source_type IN (
            'web', 'social', 'market', 'db', 'document', 'other'
        )),
    source_uri TEXT,
    source_uri_sha256 TEXT,
    title TEXT,

    -- cited is a 0/1 flag; retrieval_rank, when present, is non-negative.
    cited INTEGER NOT NULL DEFAULT 0 CHECK (cited IN (0, 1)),
    retrieval_rank INTEGER
        CHECK (retrieval_rank IS NULL OR retrieval_rank >= 0),
    observed_at TEXT,
    support_status TEXT NOT NULL DEFAULT 'unknown'
        CHECK (support_status IN (
            'supports', 'context', 'conflicts', 'stale', 'unknown'
        )),

    -- Guard column: the CHECK admits only 0.
    secret_values_included INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),
    created_at TEXT DEFAULT CURRENT_TIMESTAMP
);
-- Secondary indexes on agent_research_sources.

-- Composite index on parent run and the cited flag.
CREATE INDEX IF NOT EXISTS idx_agent_research_sources_run
    ON agent_research_sources (research_run_id, cited);

-- Lookup by originating tool invocation.
CREATE INDEX IF NOT EXISTS idx_agent_research_sources_tool
    ON agent_research_sources (tool_invocation_id);
-- agent_eval_cases
-- Versioned evaluation cases grouped by suite. A case is identified by
-- (suite_id, case_slug, case_version), enforced by the UNIQUE constraint at
-- the end of the table.
CREATE TABLE IF NOT EXISTS agent_eval_cases (
    id TEXT PRIMARY KEY,
    suite_id TEXT NOT NULL,
    case_slug TEXT NOT NULL,
    case_version INTEGER NOT NULL DEFAULT 1 CHECK (case_version >= 1),

    -- Prompt hash plus a mandatory excerpt of at most 1000 characters.
    prompt_sha256 TEXT NOT NULL,
    prompt_excerpt TEXT NOT NULL
        CHECK (length(prompt_excerpt) <= 1000),

    -- Optional fixture context: hash plus an excerpt of at most 2000
    -- characters.
    fixture_context_sha256 TEXT,
    fixture_context_excerpt TEXT
        CHECK (fixture_context_excerpt IS NULL OR length(fixture_context_excerpt) <= 2000),

    -- Expected routing outcome; the allowed values are the same list as
    -- agent_research_runs.selected_route.
    expected_route TEXT NOT NULL
        CHECK (expected_route IN (
            'none', 'web_search', 'social_trends', 'structured_market_data',
            'local_context', 'mixed', 'unknown'
        )),
    expected_provider TEXT,

    -- JSON payloads stored as TEXT; only the defaults are fixed here, the
    -- schema does not validate that the content is well-formed JSON.
    must_use_tools_json TEXT NOT NULL DEFAULT '[]',
    must_not_use_tools_json TEXT NOT NULL DEFAULT '[]',
    tags_json TEXT NOT NULL DEFAULT '[]',
    rubric_json TEXT NOT NULL DEFAULT '{}',

    stale_after TEXT,
    active INTEGER NOT NULL DEFAULT 1 CHECK (active IN (0, 1)),

    -- Guard column: the CHECK admits only 0.
    secret_values_included INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,

    UNIQUE (suite_id, case_slug, case_version)
);
-- Secondary indexes on agent_eval_cases.

-- Composite index on suite and the active flag.
CREATE INDEX IF NOT EXISTS idx_agent_eval_cases_suite_active
    ON agent_eval_cases (suite_id, active);

-- Lookup by expected route.
CREATE INDEX IF NOT EXISTS idx_agent_eval_cases_route
    ON agent_eval_cases (expected_route);
-- agent_eval_results
-- Result of scoring one research run against one eval case; the UNIQUE
-- constraint allows at most one result per (eval_case_id, research_run_id).
-- Rows are deleted with either parent (ON DELETE CASCADE); the optional link
-- to graph_evaluation_runs is cleared on delete (ON DELETE SET NULL).
CREATE TABLE IF NOT EXISTS agent_eval_results (
    id TEXT PRIMARY KEY,
    eval_case_id TEXT NOT NULL
        REFERENCES agent_eval_cases(id) ON DELETE CASCADE,
    research_run_id TEXT NOT NULL
        REFERENCES agent_research_runs(id) ON DELETE CASCADE,
    graph_evaluation_run_id TEXT
        REFERENCES graph_evaluation_runs(id) ON DELETE SET NULL,

    status TEXT NOT NULL
        CHECK (status IN ('passed', 'failed', 'warning', 'blocked', 'skipped')),

    -- Every score is optional and, when present, lies in the closed range
    -- [0, 1]. routing_correct is an optional 0/1 flag.
    score REAL
        CHECK (score IS NULL OR score BETWEEN 0 AND 1),
    routing_correct INTEGER
        CHECK (routing_correct IS NULL OR routing_correct IN (0, 1)),
    tool_choice_score REAL
        CHECK (tool_choice_score IS NULL OR tool_choice_score BETWEEN 0 AND 1),
    source_quality_score REAL
        CHECK (source_quality_score IS NULL OR source_quality_score BETWEEN 0 AND 1),
    groundedness_score REAL
        CHECK (groundedness_score IS NULL OR groundedness_score BETWEEN 0 AND 1),
    freshness_score REAL
        CHECK (freshness_score IS NULL OR freshness_score BETWEEN 0 AND 1),
    cost_efficiency_score REAL
        CHECK (cost_efficiency_score IS NULL OR cost_efficiency_score BETWEEN 0 AND 1),
    safety_payment_score REAL
        CHECK (safety_payment_score IS NULL OR safety_payment_score BETWEEN 0 AND 1),

    failure_reason TEXT,
    judge TEXT,
    proof_ref TEXT,

    -- Guard column: the CHECK admits only 0.
    secret_values_included INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,

    UNIQUE (eval_case_id, research_run_id)
);
-- Secondary indexes on agent_eval_results.

-- Composite index on eval case and result status.
CREATE INDEX IF NOT EXISTS idx_agent_eval_results_case_status
    ON agent_eval_results (eval_case_id, status);

-- Lookup by research run.
CREATE INDEX IF NOT EXISTS idx_agent_eval_results_run
    ON agent_eval_results (research_run_id);

-- Lookup by linked graph evaluation run.
CREATE INDEX IF NOT EXISTS idx_agent_eval_results_graph_eval
    ON agent_eval_results (graph_evaluation_run_id);
-- work_order_graph_links
-- Links a sponsored work order to an object identified by (graph_layer,
-- graph_id), tagged with the role the object plays. Neither
-- sponsored_work_order_id nor graph_id carries a foreign key; graph_layer is
-- restricted to a fixed list of layer names.
CREATE TABLE IF NOT EXISTS work_order_graph_links (
    id TEXT PRIMARY KEY,
    sponsored_work_order_id TEXT NOT NULL,

    role TEXT NOT NULL
        CHECK (role IN (
            'input_context', 'evaluation_target',
            'created_evidence', 'created_claim', 'created_eval_run',
            'research_run', 'tool_trace', 'history_trace', 'outcome_trace'
        )),

    graph_layer TEXT NOT NULL
        CHECK (graph_layer IN (
            'persona', 'strategy', 'position', 'belief', 'claim', 'evidence',
            'edge', 'graph_evaluation_run', 'cascade_event',
            'graph_history_event',
            'agent_research_run', 'agent_tool_invocation', 'agent_eval_result',
            'outcome_observation'
        )),
    graph_id TEXT NOT NULL,
    rationale TEXT,

    -- Guard column: the CHECK admits only 0.
    secret_values_included INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,

    -- The same object may be linked to a work order once per role.
    UNIQUE (sponsored_work_order_id, role, graph_layer, graph_id)
);
-- Secondary indexes on work_order_graph_links.

-- Lookup by work order.
-- NOTE(review): the table's UNIQUE (sponsored_work_order_id, role,
-- graph_layer, graph_id) constraint already leads with
-- sponsored_work_order_id, so this single-column index may be redundant —
-- confirm with query plans before dropping it.
CREATE INDEX IF NOT EXISTS idx_work_order_graph_links_work_order
    ON work_order_graph_links (sponsored_work_order_id);

-- Reverse lookup from a linked object to its work orders.
CREATE INDEX IF NOT EXISTS idx_work_order_graph_links_graph
    ON work_order_graph_links (graph_layer, graph_id);