Add Teleo research eval schema

Adds graph schema prerequisite plus research-eval schema/docs/tests for Leo tool-use benchmarks and x402 research telemetry. Validated by full local pytest and green CI.
This commit is contained in:
twentyOne2x 2026-06-24 14:21:03 +02:00 committed by GitHub
parent 533295d38c
commit 1a71efcde2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 1169 additions and 0 deletions

View file

@ -0,0 +1,104 @@
# Teleo Agent Graph Schema v1
Source idea: `teleo-agent-architecture-COMBINED (2).excalidraw`.
This schema models the agent commons as a graph:
```text
persona -> strategy -> position -> belief -> claim -> evidence
```
The top layers are agent-owned. The lower layers are shared commons.
Changes cascade upward: evidence changes re-evaluate claims, claims flag beliefs,
beliefs flag positions, and positions can force persona/strategy review.
## Design Commitments
- Personas are authored, stable, and loaded every turn.
- Strategies are derived from personas using the Rumelt kernel:
diagnosis, guiding policy, proximate objectives.
- Positions and beliefs are per-agent public commitments.
- Claims are owned by no agent.
- Evidence is owned by no agent.
- Claims link to claims through typed weighted edges.
- One evidence node can ground many claims.
- One claim can be cited by many beliefs across agents and domains.
- `cited_by` and `importance` are computed/readback fields, not hand-authored
truth.
- Every edge has a relation, weight, and rationale so cascade behavior is
auditable.
## Main Tables
| Table | Purpose |
| --- | --- |
| `agents` | Agent registry: Leo, Rio, Theseus, etc. |
| `agent_persona_revisions` | Stable authored identity, voice, and role snapshots |
| `agent_strategy_revisions` | Derived diagnosis, guiding policy, and objectives |
| `agent_positions` | Per-agent public commitments with falsification criteria |
| `agent_beliefs` | Per-agent falsifiable beliefs citing claims |
| `claims` | Shared claim commons |
| `evidence` | Shared sourced/verifiable evidence commons |
| `position_belief_edges` | Position depends on belief |
| `belief_claim_edges` | Belief cites or depends on claim |
| `claim_edges` | Claim-to-claim typed relationship |
| `claim_evidence_edges` | Claim grounded by evidence |
| `graph_evaluation_runs` | Evaluation/re-evaluation records |
| `cascade_events` | Upward propagation queue/history |
| `graph_history_events` | Sanitized GitHub/Forgejo/local-git manifest events |
| `graph_node_history_links` | Links history events to graph nodes |
## Claim Node
Diagram frontmatter maps to `claims`:
| Diagram field | Column |
| --- | --- |
| `type: claim` | implicit table |
| `domain` | `claims.domain` |
| `description` | `claims.description` |
| `confidence` | `claims.confidence` |
| `source` | `claims.source_summary`, plus evidence edges |
| `created` | `claims.created_at` |
| `last_evaluated` | `claims.last_evaluated` |
| `cross_references` | `claim_edges` |
| `importance` | `claims.importance`, computed from inbound refs |
| `attribution` | `claims.attribution_json` |
## Claim Relations
| Relation | Meaning |
| --- | --- |
| `depends_on` | This claim cannot be true unless the linked claim is true |
| `supports` | Linked claim provides evidence for this one |
| `challenged_by` | Linked claim is counter-argument or counter-evidence |
| `cited_by` | Computed inbound reference, not hand-authored |
| `related` | Topical link without a specific evidential relationship |
## Experiment Use
This schema should be applied after a test database is created and before a
history manifest is loaded:
```text
spin database
apply teleo-agent-graph-v1.sql
load history manifest through graph adapter
run persona/journey/red-team experiments
verify cascades and graph invariants
tear database down
```
## Minimum Invariants
- Every active belief must cite at least three claims before it can be marked
`load_bearing`.
- Every active claim must have at least one evidence edge before it can be
marked `accepted`.
- Red-team or quarantined claims cannot be cited by active beliefs unless the
edge relation is `challenged_by`.
- `claim_edges` cannot self-reference.
- `importance` should be recomputed from inbound belief and claim references
during loader/evaluation jobs.
- Any evidence update must produce cascade events for affected claims and
upstream beliefs/positions.

View file

@ -0,0 +1,251 @@
-- Teleo Agent Graph Schema v1
-- Common SQL subset intended for ephemeral SQLite tests and Postgres/Supabase
-- staging. IDs are app-generated text IDs so this can run across engines.
-- Registry of schema bundles applied to this database: one row per version tag.
CREATE TABLE IF NOT EXISTS graph_schema_version (
    version    TEXT PRIMARY KEY,               -- version tag, e.g. 'teleo-agent-graph-v1'
    source     TEXT NOT NULL,                  -- label of the design source the bundle came from
    applied_at TEXT DEFAULT CURRENT_TIMESTAMP  -- filled in by the database on insert
);
-- Record that this schema bundle has been applied; re-running the script is a
-- no-op. ON CONFLICT ... DO NOTHING replaces INSERT OR IGNORE because the
-- latter is SQLite-only syntax and this script is also meant to run on
-- Postgres/Supabase. Requires SQLite >= 3.24 or PostgreSQL >= 9.5.
INSERT INTO graph_schema_version (version, source)
VALUES ('teleo-agent-graph-v1', 'teleo-agent-architecture-excalidraw')
ON CONFLICT (version) DO NOTHING;
-- Agent registry. The slug is the key that the per-agent tables reference.
CREATE TABLE IF NOT EXISTS agents (
    slug         TEXT PRIMARY KEY,
    display_name TEXT NOT NULL,
    archetype    TEXT,
    status       TEXT NOT NULL DEFAULT 'active'
        CHECK (status IN ('active', 'inactive', 'deprecated')),
    created_at   TEXT DEFAULT CURRENT_TIMESTAMP,
    updated_at   TEXT DEFAULT CURRENT_TIMESTAMP  -- default applies on insert only
);
-- Authored persona snapshots (identity, voice, role), numbered per agent.
CREATE TABLE IF NOT EXISTS agent_persona_revisions (
    id               TEXT PRIMARY KEY,
    agent_slug       TEXT NOT NULL REFERENCES agents(slug),
    revision         INTEGER NOT NULL,
    identity         TEXT NOT NULL,
    voice            TEXT NOT NULL,
    role             TEXT NOT NULL,
    authored_by      TEXT,
    -- The three flags below are 0/1 booleans stored as integers.
    stable           INTEGER NOT NULL DEFAULT 1 CHECK (stable IN (0, 1)),
    loads_every_turn INTEGER NOT NULL DEFAULT 1 CHECK (loads_every_turn IN (0, 1)),
    active           INTEGER NOT NULL DEFAULT 1 CHECK (active IN (0, 1)),
    notes            TEXT,
    created_at       TEXT DEFAULT CURRENT_TIMESTAMP,
    -- A revision number can be used only once per agent.
    UNIQUE (agent_slug, revision)
);
-- Strategy snapshots (diagnosis, guiding policy, proximate objectives),
-- numbered per agent and optionally tied to the persona revision they derive from.
CREATE TABLE IF NOT EXISTS agent_strategy_revisions (
    id                        TEXT PRIMARY KEY,
    agent_slug                TEXT NOT NULL REFERENCES agents(slug),
    persona_revision_id       TEXT REFERENCES agent_persona_revisions(id),
    revision                  INTEGER NOT NULL,
    diagnosis                 TEXT NOT NULL,
    guiding_policy            TEXT NOT NULL,
    proximate_objectives_json TEXT NOT NULL DEFAULT '[]',  -- JSON text; defaults to an empty array
    derivation_notes          TEXT,
    active                    INTEGER NOT NULL DEFAULT 1 CHECK (active IN (0, 1)),
    created_at                TEXT DEFAULT CURRENT_TIMESTAMP,
    -- A revision number can be used only once per agent.
    UNIQUE (agent_slug, revision)
);
-- Per-agent positions: a titled statement with optional falsification criteria.
CREATE TABLE IF NOT EXISTS agent_positions (
    id                     TEXT PRIMARY KEY,
    agent_slug             TEXT NOT NULL REFERENCES agents(slug),
    title                  TEXT NOT NULL,
    statement              TEXT NOT NULL,
    falsification_criteria TEXT,
    public_commitment      INTEGER NOT NULL DEFAULT 1 CHECK (public_commitment IN (0, 1)),
    -- Same confidence vocabulary as agent_beliefs and claims.
    confidence             TEXT NOT NULL DEFAULT 'experimental'
        CHECK (confidence IN ('proven', 'likely', 'experimental', 'speculative')),
    status                 TEXT NOT NULL DEFAULT 'active'
        CHECK (status IN ('draft', 'active', 'flagged', 'retired')),
    created_at             TEXT DEFAULT CURRENT_TIMESTAMP,
    last_reviewed          TEXT
);
-- Per-agent beliefs. Each belief carries a per-agent code, a falsifiable
-- statement, and the number of claims it is expected to cite.
CREATE TABLE IF NOT EXISTS agent_beliefs (
    id                     TEXT PRIMARY KEY,
    agent_slug             TEXT NOT NULL REFERENCES agents(slug),
    belief_code            TEXT NOT NULL,
    title                  TEXT NOT NULL,
    statement              TEXT NOT NULL,
    falsification_criteria TEXT,
    is_keystone            INTEGER NOT NULL DEFAULT 0 CHECK (is_keystone IN (0, 1)),
    -- Claim-count threshold for this belief. A negative threshold is
    -- meaningless, so it is rejected here, in line with the range checks on the
    -- other bounded numeric columns in this schema (weight, importance, ...).
    min_claims             INTEGER NOT NULL DEFAULT 3 CHECK (min_claims >= 0),
    confidence             TEXT NOT NULL DEFAULT 'experimental'
        CHECK (confidence IN ('proven', 'likely', 'experimental', 'speculative')),
    status                 TEXT NOT NULL DEFAULT 'active'
        CHECK (status IN ('draft', 'active', 'load_bearing', 'flagged', 'retired')),
    created_at             TEXT DEFAULT CURRENT_TIMESTAMP,
    last_evaluated         TEXT,
    -- A belief code can be used only once per agent.
    UNIQUE (agent_slug, belief_code)
);
-- Shared evidence rows. There is no agent column: evidence is not tied to an agent.
CREATE TABLE IF NOT EXISTS evidence (
    id                  TEXT PRIMARY KEY,
    evidence_type       TEXT NOT NULL
        CHECK (evidence_type IN (
            'study', 'data', 'event', 'formal_result',
            'legal', 'market', 'historical', 'other'
        )),
    title               TEXT NOT NULL,
    source_uri          TEXT,
    citation            TEXT,
    summary             TEXT NOT NULL,
    verification_status TEXT NOT NULL DEFAULT 'unverified'
        CHECK (verification_status IN (
            'unverified', 'sourced', 'verified', 'disputed', 'retracted'
        )),
    observed_at         TEXT,
    attribution_json    TEXT NOT NULL DEFAULT '{}',  -- JSON text; defaults to an empty object
    created_at          TEXT DEFAULT CURRENT_TIMESTAMP
);
-- Shared claim rows. There is no agent foreign key: proposed_by is free text.
CREATE TABLE IF NOT EXISTS claims (
    id                  TEXT PRIMARY KEY,
    slug                TEXT NOT NULL UNIQUE,
    domain              TEXT NOT NULL,
    description         TEXT NOT NULL,
    confidence          TEXT NOT NULL DEFAULT 'experimental'
        CHECK (confidence IN ('proven', 'likely', 'experimental', 'speculative')),
    source_summary      TEXT,
    proposed_by         TEXT,
    primary_evidence_id TEXT REFERENCES evidence(id),
    -- Score constrained to the closed range [0, 1].
    importance          REAL NOT NULL DEFAULT 0
        CHECK (importance >= 0 AND importance <= 1),
    status              TEXT NOT NULL DEFAULT 'draft'
        CHECK (status IN (
            'draft', 'active', 'accepted', 'challenged', 'quarantined', 'retired'
        )),
    attribution_json    TEXT NOT NULL DEFAULT '{}',  -- JSON text; defaults to an empty object
    created_at          TEXT DEFAULT CURRENT_TIMESTAMP,
    last_evaluated      TEXT
);
-- Typed, weighted edge from a position to a belief. A rationale is mandatory.
CREATE TABLE IF NOT EXISTS position_belief_edges (
    id          TEXT PRIMARY KEY,
    position_id TEXT NOT NULL REFERENCES agent_positions(id),
    belief_id   TEXT NOT NULL REFERENCES agent_beliefs(id),
    relation    TEXT NOT NULL DEFAULT 'depends_on'
        CHECK (relation IN ('depends_on', 'supports', 'challenged_by', 'related')),
    weight      REAL NOT NULL DEFAULT 1 CHECK (weight >= 0 AND weight <= 1),
    rationale   TEXT NOT NULL,
    created_at  TEXT DEFAULT CURRENT_TIMESTAMP,
    -- At most one edge per (position, belief, relation) triple.
    UNIQUE (position_id, belief_id, relation)
);
-- Typed, weighted edge from a belief to a claim. A rationale is mandatory.
CREATE TABLE IF NOT EXISTS belief_claim_edges (
    id         TEXT PRIMARY KEY,
    belief_id  TEXT NOT NULL REFERENCES agent_beliefs(id),
    claim_id   TEXT NOT NULL REFERENCES claims(id),
    relation   TEXT NOT NULL DEFAULT 'cites'
        CHECK (relation IN ('cites', 'depends_on', 'supports', 'challenged_by', 'related')),
    weight     REAL NOT NULL DEFAULT 1 CHECK (weight >= 0 AND weight <= 1),
    rationale  TEXT NOT NULL,
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    -- At most one edge per (belief, claim, relation) triple.
    UNIQUE (belief_id, claim_id, relation)
);
-- Typed, weighted claim-to-claim edge. A rationale is mandatory.
CREATE TABLE IF NOT EXISTS claim_edges (
    id            TEXT PRIMARY KEY,
    from_claim_id TEXT NOT NULL REFERENCES claims(id),
    to_claim_id   TEXT NOT NULL REFERENCES claims(id),
    relation      TEXT NOT NULL
        CHECK (relation IN ('depends_on', 'supports', 'challenged_by', 'cited_by', 'related')),
    weight        REAL NOT NULL DEFAULT 1 CHECK (weight >= 0 AND weight <= 1),
    rationale     TEXT NOT NULL,
    authored_by   TEXT,
    computed      INTEGER NOT NULL DEFAULT 0 CHECK (computed IN (0, 1)),  -- 0/1 flag
    created_at    TEXT DEFAULT CURRENT_TIMESTAMP,
    -- A claim may not link to itself.
    CHECK (from_claim_id <> to_claim_id),
    -- At most one edge per (from, to, relation) triple.
    UNIQUE (from_claim_id, to_claim_id, relation)
);
-- Typed, weighted edge from a claim to an evidence row. A rationale is mandatory.
CREATE TABLE IF NOT EXISTS claim_evidence_edges (
    id          TEXT PRIMARY KEY,
    claim_id    TEXT NOT NULL REFERENCES claims(id),
    evidence_id TEXT NOT NULL REFERENCES evidence(id),
    relation    TEXT NOT NULL DEFAULT 'supports'
        CHECK (relation IN ('primary', 'supports', 'challenges', 'context', 'weakens')),
    weight      REAL NOT NULL DEFAULT 1 CHECK (weight >= 0 AND weight <= 1),
    rationale   TEXT NOT NULL,
    created_at  TEXT DEFAULT CURRENT_TIMESTAMP,
    -- At most one edge per (claim, evidence, relation) triple.
    UNIQUE (claim_id, evidence_id, relation)
);
-- Evaluation records. The target is addressed by (target_layer, target_id);
-- target_id has no foreign key because it can point into any layer's table.
CREATE TABLE IF NOT EXISTS graph_evaluation_runs (
    id           TEXT PRIMARY KEY,
    target_layer TEXT NOT NULL
        CHECK (target_layer IN (
            'persona', 'strategy', 'position', 'belief', 'claim', 'evidence', 'edge'
        )),
    target_id    TEXT NOT NULL,
    trigger_type TEXT NOT NULL
        CHECK (trigger_type IN (
            'scheduled', 'history_replay', 'evidence_changed',
            'claim_changed', 'manual', 'red_team'
        )),
    trigger_id   TEXT,
    evaluator    TEXT NOT NULL,
    model        TEXT,
    verdict      TEXT NOT NULL
        CHECK (verdict IN (
            'approve', 'request_changes', 'reject', 'flag', 'quarantine', 'no_op'
        )),
    -- Optional numeric confidence in [0, 1]; NULL when not recorded.
    confidence   REAL CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)),
    notes        TEXT,
    created_at   TEXT DEFAULT CURRENT_TIMESTAMP
);
-- Propagation records: a change to one node (changed_*) that affects another
-- node (affected_*). Neither id column has a foreign key.
CREATE TABLE IF NOT EXISTS cascade_events (
    id             TEXT PRIMARY KEY,
    changed_layer  TEXT NOT NULL
        CHECK (changed_layer IN (
            'evidence', 'claim', 'belief', 'position', 'strategy', 'persona'
        )),
    changed_id     TEXT NOT NULL,
    -- 'evidence' is not an allowed affected layer.
    affected_layer TEXT NOT NULL
        CHECK (affected_layer IN (
            'claim', 'belief', 'position', 'strategy', 'persona'
        )),
    affected_id    TEXT NOT NULL,
    direction      TEXT NOT NULL DEFAULT 'up'
        CHECK (direction IN ('up', 'down', 'lateral')),
    status         TEXT NOT NULL DEFAULT 'queued'
        CHECK (status IN ('queued', 'reviewing', 'resolved', 'ignored')),
    reason         TEXT NOT NULL,
    created_at     TEXT DEFAULT CURRENT_TIMESTAMP,
    resolved_at    TEXT
);
-- History events imported from an external provider or entered manually.
CREATE TABLE IF NOT EXISTS graph_history_events (
    id                TEXT PRIMARY KEY,
    provider          TEXT NOT NULL
        CHECK (provider IN (
            'github', 'forgejo', 'local_git', 'web', 'x', 'telegram', 'manual'
        )),
    repo              TEXT,
    provider_event_id TEXT,
    event_type        TEXT NOT NULL,
    actor             TEXT,
    occurred_at       TEXT,
    payload_json      TEXT NOT NULL DEFAULT '{}',  -- JSON text; defaults to an empty object
    redacted          INTEGER NOT NULL DEFAULT 1 CHECK (redacted IN (0, 1)),  -- 0/1 flag, on by default
    created_at        TEXT DEFAULT CURRENT_TIMESTAMP
);
-- Join table tying a history event to a graph node, with the role the event
-- played for that node. The node is addressed by (node_layer, node_id).
CREATE TABLE IF NOT EXISTS graph_node_history_links (
    history_event_id TEXT NOT NULL REFERENCES graph_history_events(id),
    node_layer       TEXT NOT NULL
        CHECK (node_layer IN (
            'persona', 'strategy', 'position', 'belief', 'claim', 'evidence', 'edge'
        )),
    node_id          TEXT NOT NULL,
    role             TEXT NOT NULL
        CHECK (role IN (
            'created', 'updated', 'evaluated', 'merged', 'challenged', 'cited', 'sourced'
        )),
    -- One row per (event, node, role) combination.
    PRIMARY KEY (history_event_id, node_layer, node_id, role)
);
-- Secondary indexes for the graph schema.

-- Per-agent lookups filtered by the active flag or status.
CREATE INDEX IF NOT EXISTS idx_persona_revisions_agent_active
    ON agent_persona_revisions(agent_slug, active);
CREATE INDEX IF NOT EXISTS idx_strategy_revisions_agent_active
    ON agent_strategy_revisions(agent_slug, active);
CREATE INDEX IF NOT EXISTS idx_positions_agent_status
    ON agent_positions(agent_slug, status);
CREATE INDEX IF NOT EXISTS idx_beliefs_agent_status
    ON agent_beliefs(agent_slug, status);

-- Shared node lookups.
CREATE INDEX IF NOT EXISTS idx_claims_domain_status
    ON claims(domain, status);
CREATE INDEX IF NOT EXISTS idx_claims_importance
    ON claims(importance);
CREATE INDEX IF NOT EXISTS idx_evidence_status
    ON evidence(verification_status);

-- Edge lookups starting from the referenced (child) node. Each edge table's
-- UNIQUE constraint leads with the parent id, so it cannot serve these.
-- idx_position_belief_edges_belief fills the one gap: every other edge table
-- already had a child-first index, position_belief_edges did not.
CREATE INDEX IF NOT EXISTS idx_position_belief_edges_belief
    ON position_belief_edges(belief_id, relation);
CREATE INDEX IF NOT EXISTS idx_belief_claim_edges_claim
    ON belief_claim_edges(claim_id, relation);
CREATE INDEX IF NOT EXISTS idx_claim_edges_to
    ON claim_edges(to_claim_id, relation);
CREATE INDEX IF NOT EXISTS idx_claim_evidence_edges_evidence
    ON claim_evidence_edges(evidence_id, relation);

-- Cascade and history lookups.
CREATE INDEX IF NOT EXISTS idx_cascade_status
    ON cascade_events(status, affected_layer);
CREATE INDEX IF NOT EXISTS idx_history_provider_repo
    ON graph_history_events(provider, repo, event_type);

View file

@ -0,0 +1,73 @@
# Teleo Agent Research Eval Schema v1
Apply this schema after `teleo-agent-graph-v1.sql`.
This schema records how Leo and other agents answer research requests, which
tools they choose, what sources they cite, and whether benchmark cases passed.
It is operational/economic telemetry, not the claim/evidence graph itself.
## Design Commitments
- The graph schema remains the knowledge spine: persona, strategy, beliefs,
claims, evidence, graph evals, and cascades.
- Research-eval rows explain how a request was handled and whether the route was
good enough to trust or ship.
- Payment funds work. It does not directly mutate claims, confidence, beliefs,
or rewards.
- Tool-use benchmarking must distinguish candidates, selected tools, executed
tools, skipped tools, and rejected tools.
- Secrets and private payloads are never stored. Tables store hashes, redacted
excerpts, proof references, source metadata, and receipt ids.
## Main Tables
| Table | Purpose |
| --- | --- |
| `agent_research_runs` | One row per research request from Telegram, API, checkout, CLI, or benchmark. |
| `agent_tool_invocations` | One row per candidate, selected, executed, skipped, rejected, fallback, or failed tool decision. |
| `agent_research_sources` | Retrieved or cited source rows tied to a run and optionally a tool invocation. |
| `agent_eval_cases` | Versioned benchmark prompts, expected routes/providers, tool constraints, tags, and rubrics. |
| `agent_eval_results` | Per-case result, routing correctness, tool score, source quality, groundedness, cost, and safety scores. |
| `work_order_graph_links` | Links sponsored work orders to research runs, tool traces, graph evals, evidence, claims, and outcomes. |
## Leo x402 Research Flow
```text
Telegram/API question
-> agent_research_runs
-> agent_tool_invocations
-> agent_research_sources
-> agent_eval_results when a benchmark case applies
-> work_order_graph_links when a paid work order or graph artifact is involved
```
For paid research, `agent_research_runs.sponsored_work_order_id` and
`payment_receipt_id` carry the external work-order/payment anchors. The payment
receipt table is still owned by the economic/payment layer; this schema only
keeps references.
## Ranger Liquidation Guard
The Ranger benchmark class should be represented as:
- `agent_eval_cases.expected_route = 'web_search'`
- `agent_eval_cases.tags_json` includes `ranger_liquidated`
- `agent_eval_cases.must_not_use_tools_json` includes market-data-only routes
- `agent_tool_invocations` records market data as `rejected` or `skipped` when
it is not the right tool
- `agent_eval_results.routing_correct = 1` only if Leo routed to source-backed
research instead of live-token valuation
This ensures "Ranger is liquidated/gone" is verified before any valuation
framing and never silently treated as a normal live fair-value token question.
## Minimum Invariants
- No row may set `secret_values_included = 1`.
- A benchmark result must link to both an eval case and a research run.
- Tool invocation sequence numbers are unique per research run.
- Scores are bounded between `0` and `1`.
- Research runs store prompt and answer hashes plus optional redacted excerpts,
not raw private prompts.
- `outcome_observations` remain the downstream business-value layer; raw tool
traces belong here, not there.

View file

@ -0,0 +1,247 @@
-- Teleo Agent Research Eval Schema v1
-- Common SQL subset intended for ephemeral SQLite tests and Postgres/Supabase
-- staging. IDs are app-generated text IDs so this can run across engines.
--
-- Apply after teleo-agent-graph-v1.sql.
--
-- Secret policy: store hashes, redacted excerpts, and proof references only.
-- Raw prompts, bearer tokens, API keys, wallet secrets, and private receipts do
-- not belong in these tables.
-- Record that the research-eval bundle has been applied; re-running the script
-- is a no-op. ON CONFLICT ... DO NOTHING replaces INSERT OR IGNORE because the
-- latter is SQLite-only syntax and this script is also meant to run on
-- Postgres/Supabase. Requires SQLite >= 3.24 or PostgreSQL >= 9.5.
INSERT INTO graph_schema_version (version, source)
VALUES ('teleo-agent-research-eval-v1', 'leo-x402-research-routing-benchmark')
ON CONFLICT (version) DO NOTHING;
-- One row per research request. Prompt and answer are stored as hashes plus
-- optional length-capped excerpts.
CREATE TABLE IF NOT EXISTS agent_research_runs (
    id                      TEXT PRIMARY KEY,
    agent_slug              TEXT NOT NULL REFERENCES agents(slug),
    source_surface          TEXT NOT NULL
        CHECK (source_surface IN (
            'telegram', 'api', 'checkout', 'web', 'cli', 'test', 'other'
        )),
    source_ref              TEXT,
    request_kind            TEXT NOT NULL DEFAULT 'free'
        CHECK (request_kind IN (
            'free', 'paid_quote', 'paid_work_order', 'benchmark', 'system'
        )),
    -- External anchors: plain text references with no foreign key in this schema.
    sponsored_work_order_id TEXT,
    payment_receipt_id      TEXT,
    prompt_sha256           TEXT NOT NULL,
    prompt_excerpt          TEXT,
    selected_provider       TEXT,
    -- Same route vocabulary as agent_eval_cases.expected_route.
    selected_route          TEXT NOT NULL DEFAULT 'unknown'
        CHECK (selected_route IN (
            'none',
            'web_search',
            'social_trends',
            'structured_market_data',
            'local_context',
            'mixed',
            'unknown'
        )),
    status                  TEXT NOT NULL DEFAULT 'running'
        CHECK (status IN (
            'quoted',
            'payment_pending',
            'running',
            'answered',
            'abstained',
            'blocked',
            'failed',
            'cancelled'
        )),
    answer_sha256           TEXT,
    answer_excerpt          TEXT,
    proof_ref               TEXT,
    -- NOTE(review): REAL is a binary float; confirm that is precise enough for
    -- the amounts recorded here.
    cost_amount             REAL NOT NULL DEFAULT 0 CHECK (cost_amount >= 0),
    currency                TEXT NOT NULL DEFAULT 'USDC',
    latency_ms              INTEGER CHECK (latency_ms IS NULL OR latency_ms >= 0),
    source_count            INTEGER NOT NULL DEFAULT 0 CHECK (source_count >= 0),
    -- Pinned to 0: a row flagged as containing secrets cannot be stored.
    secret_values_included  INTEGER NOT NULL DEFAULT 0 CHECK (secret_values_included = 0),
    created_at              TEXT DEFAULT CURRENT_TIMESTAMP,
    completed_at            TEXT,
    -- Excerpt length caps: 1000 characters for prompts, 2000 for answers.
    CHECK (prompt_excerpt IS NULL OR length(prompt_excerpt) <= 1000),
    CHECK (answer_excerpt IS NULL OR length(answer_excerpt) <= 2000)
);

-- Lookups by agent over time, by work order, and by status/route.
CREATE INDEX IF NOT EXISTS idx_agent_research_runs_agent_created
    ON agent_research_runs(agent_slug, created_at);
CREATE INDEX IF NOT EXISTS idx_agent_research_runs_work_order
    ON agent_research_runs(sponsored_work_order_id);
CREATE INDEX IF NOT EXISTS idx_agent_research_runs_status_route
    ON agent_research_runs(status, selected_route);
-- One row per tool decision within a research run. Rows are deleted together
-- with their research run (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS agent_tool_invocations (
    id                     TEXT PRIMARY KEY,
    research_run_id        TEXT NOT NULL
        REFERENCES agent_research_runs(id) ON DELETE CASCADE,
    sequence               INTEGER NOT NULL DEFAULT 0 CHECK (sequence >= 0),
    provider               TEXT NOT NULL,
    tool_name              TEXT NOT NULL,
    tool_category          TEXT NOT NULL
        CHECK (tool_category IN (
            'web_search',
            'social_trends',
            'market_data',
            'page_read',
            'x402_checkout',
            'agentcash',
            'faremeter',
            'database',
            'local_context',
            'other'
        )),
    endpoint_host          TEXT,
    endpoint_hash          TEXT,
    decision               TEXT NOT NULL
        CHECK (decision IN (
            'candidate', 'selected', 'executed', 'skipped',
            'rejected', 'fallback', 'failed'
        )),
    decision_reason        TEXT NOT NULL,
    paid                   INTEGER NOT NULL DEFAULT 0 CHECK (paid IN (0, 1)),  -- 0/1 flag
    rail                   TEXT
        CHECK (rail IS NULL OR rail IN ('x402', 'agentcash', 'manual', 'free', 'other')),
    network                TEXT,
    -- NOTE(review): REAL is a binary float; confirm that is precise enough for
    -- the amounts recorded here.
    amount                 REAL CHECK (amount IS NULL OR amount >= 0),
    currency               TEXT NOT NULL DEFAULT 'USDC',
    payment_receipt_id     TEXT,
    input_sha256           TEXT,
    output_sha256          TEXT,
    source_count           INTEGER NOT NULL DEFAULT 0 CHECK (source_count >= 0),
    latency_ms             INTEGER CHECK (latency_ms IS NULL OR latency_ms >= 0),
    error_class            TEXT,
    -- Pinned to 0: a row flagged as containing secrets cannot be stored.
    secret_values_included INTEGER NOT NULL DEFAULT 0 CHECK (secret_values_included = 0),
    created_at             TEXT DEFAULT CURRENT_TIMESTAMP,
    -- Sequence numbers are unique within a research run.
    UNIQUE (research_run_id, sequence)
);

-- Lookups by run/decision, by provider/category, and by payment receipt.
CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_run_decision
    ON agent_tool_invocations(research_run_id, decision);
CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_provider_category
    ON agent_tool_invocations(provider, tool_category);
CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_receipt
    ON agent_tool_invocations(payment_receipt_id);
-- Source rows tied to a research run and, optionally, to the tool invocation
-- that produced them.
CREATE TABLE IF NOT EXISTS agent_research_sources (
    id                     TEXT PRIMARY KEY,
    -- Deleted together with the owning run.
    research_run_id        TEXT NOT NULL
        REFERENCES agent_research_runs(id) ON DELETE CASCADE,
    -- Reset to NULL if the referenced invocation is deleted; the source row stays.
    tool_invocation_id     TEXT
        REFERENCES agent_tool_invocations(id) ON DELETE SET NULL,
    source_type            TEXT NOT NULL
        CHECK (source_type IN ('web', 'social', 'market', 'db', 'document', 'other')),
    source_uri             TEXT,
    source_uri_sha256      TEXT,
    title                  TEXT,
    cited                  INTEGER NOT NULL DEFAULT 0 CHECK (cited IN (0, 1)),  -- 0/1 flag
    retrieval_rank         INTEGER CHECK (retrieval_rank IS NULL OR retrieval_rank >= 0),
    observed_at            TEXT,
    support_status         TEXT NOT NULL DEFAULT 'unknown'
        CHECK (support_status IN ('supports', 'context', 'conflicts', 'stale', 'unknown')),
    -- Pinned to 0: a row flagged as containing secrets cannot be stored.
    secret_values_included INTEGER NOT NULL DEFAULT 0 CHECK (secret_values_included = 0),
    created_at             TEXT DEFAULT CURRENT_TIMESTAMP
);

-- Lookups by run (with cited flag) and by tool invocation.
CREATE INDEX IF NOT EXISTS idx_agent_research_sources_run
    ON agent_research_sources(research_run_id, cited);
CREATE INDEX IF NOT EXISTS idx_agent_research_sources_tool
    ON agent_research_sources(tool_invocation_id);
-- Versioned benchmark cases: prompt hash and excerpt, expected route/provider,
-- tool constraints, tags, and rubric.
CREATE TABLE IF NOT EXISTS agent_eval_cases (
    id                      TEXT PRIMARY KEY,
    suite_id                TEXT NOT NULL,
    case_slug               TEXT NOT NULL,
    case_version            INTEGER NOT NULL DEFAULT 1 CHECK (case_version >= 1),
    prompt_sha256           TEXT NOT NULL,
    -- Excerpt length caps: 1000 characters for prompts, 2000 for fixture context.
    prompt_excerpt          TEXT NOT NULL
        CHECK (length(prompt_excerpt) <= 1000),
    fixture_context_sha256  TEXT,
    fixture_context_excerpt TEXT
        CHECK (fixture_context_excerpt IS NULL OR length(fixture_context_excerpt) <= 2000),
    -- Same route vocabulary as agent_research_runs.selected_route.
    expected_route          TEXT NOT NULL
        CHECK (expected_route IN (
            'none',
            'web_search',
            'social_trends',
            'structured_market_data',
            'local_context',
            'mixed',
            'unknown'
        )),
    expected_provider       TEXT,
    -- JSON text columns; the list columns default to '[]', the rubric to '{}'.
    must_use_tools_json     TEXT NOT NULL DEFAULT '[]',
    must_not_use_tools_json TEXT NOT NULL DEFAULT '[]',
    tags_json               TEXT NOT NULL DEFAULT '[]',
    rubric_json             TEXT NOT NULL DEFAULT '{}',
    stale_after             TEXT,
    active                  INTEGER NOT NULL DEFAULT 1 CHECK (active IN (0, 1)),  -- 0/1 flag
    -- Pinned to 0: a row flagged as containing secrets cannot be stored.
    secret_values_included  INTEGER NOT NULL DEFAULT 0 CHECK (secret_values_included = 0),
    created_at              TEXT DEFAULT CURRENT_TIMESTAMP,
    -- One row per version of a case within a suite.
    UNIQUE (suite_id, case_slug, case_version)
);

-- Lookups by suite (with active flag) and by expected route.
CREATE INDEX IF NOT EXISTS idx_agent_eval_cases_suite_active
    ON agent_eval_cases(suite_id, active);
CREATE INDEX IF NOT EXISTS idx_agent_eval_cases_route
    ON agent_eval_cases(expected_route);
-- Result of running one eval case against one research run. Every *_score
-- column is optional and, when present, constrained to [0, 1].
CREATE TABLE IF NOT EXISTS agent_eval_results (
    id                      TEXT PRIMARY KEY,
    -- Both parents are required; the result is deleted with either of them.
    eval_case_id            TEXT NOT NULL
        REFERENCES agent_eval_cases(id) ON DELETE CASCADE,
    research_run_id         TEXT NOT NULL
        REFERENCES agent_research_runs(id) ON DELETE CASCADE,
    -- Optional link; reset to NULL if the graph evaluation run is deleted.
    graph_evaluation_run_id TEXT
        REFERENCES graph_evaluation_runs(id) ON DELETE SET NULL,
    status                  TEXT NOT NULL
        CHECK (status IN ('passed', 'failed', 'warning', 'blocked', 'skipped')),
    score                   REAL
        CHECK (score IS NULL OR (score >= 0 AND score <= 1)),
    routing_correct         INTEGER
        CHECK (routing_correct IS NULL OR routing_correct IN (0, 1)),
    tool_choice_score       REAL
        CHECK (tool_choice_score IS NULL OR (tool_choice_score >= 0 AND tool_choice_score <= 1)),
    source_quality_score    REAL
        CHECK (source_quality_score IS NULL OR (source_quality_score >= 0 AND source_quality_score <= 1)),
    groundedness_score      REAL
        CHECK (groundedness_score IS NULL OR (groundedness_score >= 0 AND groundedness_score <= 1)),
    freshness_score         REAL
        CHECK (freshness_score IS NULL OR (freshness_score >= 0 AND freshness_score <= 1)),
    cost_efficiency_score   REAL
        CHECK (cost_efficiency_score IS NULL OR (cost_efficiency_score >= 0 AND cost_efficiency_score <= 1)),
    safety_payment_score    REAL
        CHECK (safety_payment_score IS NULL OR (safety_payment_score >= 0 AND safety_payment_score <= 1)),
    failure_reason          TEXT,
    judge                   TEXT,
    proof_ref               TEXT,
    -- Pinned to 0: a row flagged as containing secrets cannot be stored.
    secret_values_included  INTEGER NOT NULL DEFAULT 0 CHECK (secret_values_included = 0),
    created_at              TEXT DEFAULT CURRENT_TIMESTAMP,
    -- At most one result per (case, run) pair.
    UNIQUE (eval_case_id, research_run_id)
);

-- Lookups by case/status, by research run, and by graph evaluation run.
CREATE INDEX IF NOT EXISTS idx_agent_eval_results_case_status
    ON agent_eval_results(eval_case_id, status);
CREATE INDEX IF NOT EXISTS idx_agent_eval_results_run
    ON agent_eval_results(research_run_id);
CREATE INDEX IF NOT EXISTS idx_agent_eval_results_graph_eval
    ON agent_eval_results(graph_evaluation_run_id);
-- Links a sponsored work order to an artifact addressed by (graph_layer,
-- graph_id). Neither the work-order id nor graph_id has a foreign key.
CREATE TABLE IF NOT EXISTS work_order_graph_links (
    id                      TEXT PRIMARY KEY,
    sponsored_work_order_id TEXT NOT NULL,
    role                    TEXT NOT NULL
        CHECK (role IN (
            'input_context',
            'evaluation_target',
            'created_evidence',
            'created_claim',
            'created_eval_run',
            'research_run',
            'tool_trace',
            'history_trace',
            'outcome_trace'
        )),
    graph_layer             TEXT NOT NULL
        CHECK (graph_layer IN (
            'persona',
            'strategy',
            'position',
            'belief',
            'claim',
            'evidence',
            'edge',
            'graph_evaluation_run',
            'cascade_event',
            'graph_history_event',
            'agent_research_run',
            'agent_tool_invocation',
            'agent_eval_result',
            'outcome_observation'
        )),
    graph_id                TEXT NOT NULL,
    rationale               TEXT,
    -- Pinned to 0: a row flagged as containing secrets cannot be stored.
    secret_values_included  INTEGER NOT NULL DEFAULT 0 CHECK (secret_values_included = 0),
    created_at              TEXT DEFAULT CURRENT_TIMESTAMP,
    -- The same artifact can be linked to a work order only once per role.
    UNIQUE (sponsored_work_order_id, role, graph_layer, graph_id)
);

-- Lookups from either side of the link.
CREATE INDEX IF NOT EXISTS idx_work_order_graph_links_work_order
    ON work_order_graph_links(sponsored_work_order_id);
CREATE INDEX IF NOT EXISTS idx_work_order_graph_links_graph
    ON work_order_graph_links(graph_layer, graph_id);

View file

@ -0,0 +1,129 @@
from __future__ import annotations
import sqlite3
from pathlib import Path
# Directory one level above the folder that contains this test file.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Path to the graph schema script that the tests below load into SQLite.
SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-graph-v1.sql"
def test_agent_graph_schema_applies_and_models_shared_nodes():
    """Apply the graph schema to in-memory SQLite and check that nodes are shareable.

    Builds a small fixture graph (two agents with persona and strategy
    revisions, two evidence rows, three claims, three beliefs, their edges and
    one cascade event) with foreign keys enforced, then asserts that one
    evidence row grounds two claims, one claim is cited by two beliefs, and
    exactly one cascade event is stored.
    """
    db = sqlite3.connect(":memory:")
    db.row_factory = sqlite3.Row
    db.execute("PRAGMA foreign_keys = ON")
    db.executescript(SCHEMA_SQL.read_text())

    # Agents come first: every per-agent table references agents(slug).
    agent_rows = [
        ("leo", "Leo", "cross-domain synthesizer"),
        ("theseus", "Theseus", "AI alignment"),
    ]
    db.executemany(
        "INSERT INTO agents (slug, display_name, archetype) VALUES (?, ?, ?)",
        agent_rows,
    )

    db.execute(
        """INSERT INTO agent_persona_revisions
        (id, agent_slug, revision, identity, voice, role, authored_by)
        VALUES
        ('persona-leo-v1', 'leo', 1, 'cross-domain synthesizer', 'direct', 'evaluate commons', 'diagram'),
        ('persona-theseus-v1', 'theseus', 1, 'alignment maze navigator', 'precise', 'AI evidence lead', 'diagram')"""
    )
    db.execute(
        """INSERT INTO agent_strategy_revisions
        (id, agent_slug, persona_revision_id, revision, diagnosis, guiding_policy, proximate_objectives_json)
        VALUES
        ('strategy-leo-v1', 'leo', 'persona-leo-v1', 1, 'coordination is the bottleneck', 'surface cross-domain isomorphisms', '[]'),
        ('strategy-theseus-v1', 'theseus', 'persona-theseus-v1', 1, 'AI discourse is ungrounded', 'separate generation from evaluation', '[]')"""
    )

    # Evidence before claims: claims.primary_evidence_id references evidence(id).
    evidence_rows = [
        ("e-kim-2025", "study", "Kim et al. ICML 2025", "Shared evidence grounding coordination and verification degradation."),
        ("e-arrow", "formal_result", "Arrow impossibility theorem", "Formal result grounding alignment impossibility claim."),
    ]
    db.executemany(
        """INSERT INTO evidence
        (id, evidence_type, title, summary, verification_status)
        VALUES (?, ?, ?, ?, 'verified')""",
        evidence_rows,
    )

    claim_rows = [
        ("c-coordination", "alignment-is-coordination", "ai-alignment", "Alignment is a coordination problem, not only a technical one.", "likely", "e-kim-2025"),
        ("c-verification", "verification-degrades-with-capability", "ai-alignment", "Verification degrades as capability gaps grow.", "experimental", "e-kim-2025"),
        ("c-arrow", "universal-alignment-impossible", "ai-alignment", "Universal alignment is mathematically impossible under strong aggregation assumptions.", "likely", "e-arrow"),
    ]
    db.executemany(
        """INSERT INTO claims
        (id, slug, domain, description, confidence, primary_evidence_id, status)
        VALUES (?, ?, ?, ?, ?, ?, 'accepted')""",
        claim_rows,
    )

    # 'e-kim-2025' is attached to two different claims on purpose.
    claim_evidence_rows = [
        ("ce-kim-coordination", "c-coordination", "e-kim-2025", 0.9, "Diagram shared-node case: one evidence node grounds multiple claims."),
        ("ce-kim-verification", "c-verification", "e-kim-2025", 0.8, "Same evidence also grounds verification degradation."),
        ("ce-arrow", "c-arrow", "e-arrow", 0.9, "Formal result evidence."),
    ]
    db.executemany(
        """INSERT INTO claim_evidence_edges
        (id, claim_id, evidence_id, relation, weight, rationale)
        VALUES (?, ?, ?, 'supports', ?, ?)""",
        claim_evidence_rows,
    )

    belief_rows = [
        ("b-leo-b1", "leo", "B1", "Coordination bottleneck", "Coordination is the bottleneck.", "Falsified by civ-scale pure-tech solution.", 1),
        ("b-theseus-t2", "theseus", "T2", "Alignment as coordination", "Alignment is a coordination problem.", "Falsified by a robust one-agent technical alignment solution.", 1),
        ("b-theseus-t4", "theseus", "T4", "Verification degradation", "Verification degrades faster than capability grows.", "Falsified by scalable oversight evidence.", 0),
    ]
    db.executemany(
        """INSERT INTO agent_beliefs
        (id, agent_slug, belief_code, title, statement, falsification_criteria, is_keystone)
        VALUES (?, ?, ?, ?, ?, ?, ?)""",
        belief_rows,
    )

    # 'c-coordination' is cited by beliefs of two different agents on purpose.
    belief_claim_rows = [
        ("bc-leo-coordination", "b-leo-b1", "c-coordination", 1.0, "Keystone belief cites shared claim."),
        ("bc-theseus-coordination", "b-theseus-t2", "c-coordination", 0.9, "Different agent cites same shared claim."),
        ("bc-theseus-verification", "b-theseus-t4", "c-verification", 0.9, "Belief cites verification claim."),
        ("bc-theseus-arrow", "b-theseus-t2", "c-arrow", 0.6, "Belief also cites formal-result claim."),
    ]
    db.executemany(
        """INSERT INTO belief_claim_edges
        (id, belief_id, claim_id, relation, weight, rationale)
        VALUES (?, ?, ?, 'cites', ?, ?)""",
        belief_claim_rows,
    )

    db.execute(
        """INSERT INTO claim_edges
        (id, from_claim_id, to_claim_id, relation, weight, rationale)
        VALUES ('edge-verification-supports-coordination', 'c-verification', 'c-coordination', 'supports', 0.6, 'Oversight degradation strengthens coordination framing.')"""
    )
    db.execute(
        """INSERT INTO cascade_events
        (id, changed_layer, changed_id, affected_layer, affected_id, reason)
        VALUES ('cascade-kim-to-coordination', 'evidence', 'e-kim-2025', 'claim', 'c-coordination', 'shared evidence updated')"""
    )

    def _count(sql):
        # Each query aliases COUNT(*) as "n"; read it back by name.
        return db.execute(sql).fetchone()["n"]

    shared_evidence_count = _count(
        "SELECT COUNT(*) AS n FROM claim_evidence_edges WHERE evidence_id = 'e-kim-2025'"
    )
    shared_claim_count = _count(
        "SELECT COUNT(*) AS n FROM belief_claim_edges WHERE claim_id = 'c-coordination'"
    )
    cascade_count = _count("SELECT COUNT(*) AS n FROM cascade_events")

    assert shared_evidence_count == 2
    assert shared_claim_count == 2
    assert cascade_count == 1
def test_claim_edges_reject_self_reference():
    """A claim edge whose two endpoints are the same claim must be rejected."""
    db = sqlite3.connect(":memory:")
    db.execute("PRAGMA foreign_keys = ON")
    db.executescript(SCHEMA_SQL.read_text())
    db.execute(
        """INSERT INTO claims (id, slug, domain, description)
        VALUES ('c1', 'claim-one', 'ai-alignment', 'A claim specific enough to disagree with.')"""
    )

    self_edge_sql = """INSERT INTO claim_edges
        (id, from_claim_id, to_claim_id, relation, rationale)
        VALUES ('self', 'c1', 'c1', 'related', 'self edge should fail')"""

    rejected = False
    try:
        db.execute(self_edge_sql)
    except sqlite3.IntegrityError:
        # Expected outcome: the insert violates a constraint and is refused.
        rejected = True

    if not rejected:
        raise AssertionError("claim_edges allowed a self-reference")

View file

@ -0,0 +1,365 @@
from __future__ import annotations
import sqlite3
from pathlib import Path
# Parent of the directory that contains this file (resolved to an absolute path);
# the schema paths below are built relative to it.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Graph schema script; _conn() applies it first.
GRAPH_SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-graph-v1.sql"
# Research-eval schema script; _conn() applies it after the graph schema.
RESEARCH_EVAL_SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-research-eval-v1.sql"
def _conn() -> sqlite3.Connection:
    """Return a fresh in-memory SQLite connection with both schemas applied.

    The connection uses ``sqlite3.Row`` as its row factory (so columns can be
    read by name), has foreign-key enforcement switched on, and has had the
    graph schema script executed followed by the research-eval schema script.

    Returns:
        An open connection. The caller owns it and is responsible for closing it.

    Raises:
        Whatever ``sqlite3`` or the file read raises if a schema script cannot
        be read or applied; the connection is closed before the error propagates.
    """
    conn = sqlite3.connect(":memory:")
    try:
        conn.row_factory = sqlite3.Row
        # SQLite does not enforce foreign keys unless enabled per connection.
        conn.execute("PRAGMA foreign_keys = ON")
        # Order matters: the graph schema is applied before the research-eval schema.
        # Decode explicitly so the schema text does not depend on the locale encoding.
        conn.executescript(GRAPH_SCHEMA_SQL.read_text(encoding="utf-8"))
        conn.executescript(RESEARCH_EVAL_SCHEMA_SQL.read_text(encoding="utf-8"))
    except Exception:
        # Do not leak the connection when setup fails part-way through.
        conn.close()
        raise
    return conn
def test_research_eval_schema_applies_after_graph_schema():
    """Both schema scripts apply cleanly on one connection and register themselves.

    Checks that ``graph_schema_version`` holds exactly the two expected version
    strings and that every research-eval table name is present in
    ``sqlite_master``.
    """
    conn = _conn()
    try:
        versions = {
            row["version"]
            for row in conn.execute("SELECT version FROM graph_schema_version").fetchall()
        }
        assert versions == {
            "teleo-agent-graph-v1",
            "teleo-agent-research-eval-v1",
        }
        tables = {
            row["name"]
            for row in conn.execute(
                "SELECT name FROM sqlite_master WHERE type = 'table'"
            ).fetchall()
        }
        # Subset check: other tables may exist alongside these.
        assert {
            "agent_research_runs",
            "agent_tool_invocations",
            "agent_research_sources",
            "agent_eval_cases",
            "agent_eval_results",
            "work_order_graph_links",
        } <= tables
    finally:
        # Close explicitly; an unclosed connection triggers ResourceWarning on 3.13+.
        conn.close()
def test_ranger_liquidation_case_routes_to_source_backed_research_not_market_data():
    """End-to-end fixture for the Ranger liquidation routing benchmark case.

    Seeds one agent, one eval case, one research run, two tool invocations
    (a rejected ``market_data`` call and an executed ``web_search`` call), two
    cited sources, one graph evaluation run, one eval result, and one
    work-order link. It then asserts that:

    * the run joined to its eval result reads back the expected route,
      provider, status, routing flag, tool-choice score, and a cited-source
      count of 2;
    * no ``market_data`` invocation was executed for the run;
    * exactly one ``market_data`` invocation was rejected for the run.
    """
    conn = _conn()
    try:
        # --- Agent and benchmark case -------------------------------------
        conn.execute(
            "INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
        )
        conn.execute(
            """INSERT INTO agent_eval_cases
            (id, suite_id, case_slug, prompt_sha256, prompt_excerpt, expected_route,
            expected_provider, must_use_tools_json, must_not_use_tools_json, tags_json, rubric_json)
            VALUES
            (
            'eval-ranger-liquidated-v1',
            'leo-research-routing-v1',
            'ranger-liquidated-not-fair-value',
            'sha256:ranger-prompt',
            'Is Ranger Finance fairly valued today given Ranger Finance is liquidated and gone?',
            'web_search',
            'agentcash-stableenrich-exa-search',
            '["source-backed web research"]',
            '["structured_market_data_only", "live_token_fair_value"]',
            '["ranger_liquidated", "valuation", "source_verification"]',
            '{"routing": "verify liquidation before valuation framing"}'
            )"""
        )

        # --- Research run produced for that prompt ------------------------
        conn.execute(
            """INSERT INTO agent_research_runs
            (id, agent_slug, source_surface, source_ref, request_kind, sponsored_work_order_id,
            payment_receipt_id, prompt_sha256, prompt_excerpt, selected_provider, selected_route,
            status, answer_sha256, answer_excerpt, proof_ref, cost_amount, latency_ms, source_count)
            VALUES
            (
            'run-ranger-liquidated-001',
            'leo',
            'telegram',
            'telegram:group:message-123',
            'paid_quote',
            'sponsored_work_orders:test-ranger-001',
            'payment_receipts:test-ranger-001',
            'sha256:ranger-prompt',
            'Is Ranger Finance fairly valued today given Ranger Finance is liquidated and gone?',
            'agentcash-stableenrich-exa-search',
            'web_search',
            'answered',
            'sha256:ranger-answer',
            'Verified liquidation/gone status before valuation framing.',
            'proof/leo-ranger-liquidated-routing.json',
            0.01,
            1240,
            3
            )"""
        )

        # --- Tool invocations: market data rejected, web search executed --
        conn.executemany(
            """INSERT INTO agent_tool_invocations
            (id, research_run_id, sequence, provider, tool_name, tool_category, decision,
            decision_reason, paid, rail, network, amount, payment_receipt_id, input_sha256,
            output_sha256, source_count, latency_ms)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            [
                (
                    "tool-ranger-market-rejected",
                    "run-ranger-liquidated-001",
                    1,
                    "DexScreener",
                    "structured-market-context",
                    "market_data",
                    "rejected",
                    "Ranger liquidation status must be verified before treating this as a live token valuation.",
                    0,
                    "free",
                    None,
                    0,
                    None,
                    "sha256:market-input",
                    None,
                    0,
                    12,
                ),
                (
                    "tool-ranger-web-selected",
                    "run-ranger-liquidated-001",
                    2,
                    "AgentCash StableEnrich",
                    "exa-search",
                    "web_search",
                    "executed",
                    "Source-backed liquidation and status verification required.",
                    1,
                    "agentcash",
                    "solana:5eykt4UsFv8P8NJdTREpY1vzqKqZKvdp",
                    0.01,
                    "payment_receipts:test-ranger-001",
                    "sha256:exa-input",
                    "sha256:exa-output",
                    3,
                    1228,
                ),
            ],
        )

        # --- Sources returned by the executed web-search invocation -------
        conn.executemany(
            """INSERT INTO agent_research_sources
            (id, research_run_id, tool_invocation_id, source_type, source_uri_sha256,
            title, cited, retrieval_rank, support_status)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            [
                (
                    "source-ranger-official",
                    "run-ranger-liquidated-001",
                    "tool-ranger-web-selected",
                    "web",
                    "sha256:ranger-official",
                    "Ranger status source",
                    1,
                    1,
                    "supports",
                ),
                (
                    "source-ranger-forum",
                    "run-ranger-liquidated-001",
                    "tool-ranger-web-selected",
                    "web",
                    "sha256:ranger-forum",
                    "MetaDAO/Ranger discussion source",
                    1,
                    2,
                    "context",
                ),
            ],
        )

        # --- Graph evaluation, eval result, and work-order link -----------
        conn.execute(
            """INSERT INTO graph_evaluation_runs
            (id, target_layer, target_id, trigger_type, evaluator, verdict, confidence, notes)
            VALUES
            (
            'graph-eval-ranger-routing',
            'claim',
            'ranger-liquidated-status',
            'manual',
            'leo-research-routing-benchmark',
            'approve',
            0.92,
            'Tool choice matched Ranger liquidation guard.'
            )"""
        )
        conn.execute(
            """INSERT INTO agent_eval_results
            (id, eval_case_id, research_run_id, graph_evaluation_run_id, status, score,
            routing_correct, tool_choice_score, source_quality_score, groundedness_score,
            freshness_score, cost_efficiency_score, safety_payment_score, proof_ref)
            VALUES
            (
            'eval-result-ranger-liquidated-001',
            'eval-ranger-liquidated-v1',
            'run-ranger-liquidated-001',
            'graph-eval-ranger-routing',
            'passed',
            0.94,
            1,
            1.0,
            0.9,
            0.9,
            0.85,
            0.8,
            1.0,
            'proof/leo-ranger-liquidated-routing.json'
            )"""
        )
        conn.execute(
            """INSERT INTO work_order_graph_links
            (id, sponsored_work_order_id, role, graph_layer, graph_id, rationale)
            VALUES
            (
            'wo-ranger-run-link',
            'sponsored_work_orders:test-ranger-001',
            'research_run',
            'agent_research_run',
            'run-ranger-liquidated-001',
            'Paid work order produced source-backed research run.'
            )"""
        )

        # --- Readback -----------------------------------------------------
        # The cited = 1 filter lives in the LEFT JOIN's ON clause so a run with
        # no cited sources would still yield a row (with a count of 0).
        row = conn.execute(
            """SELECT
            r.selected_route,
            r.selected_provider,
            er.status AS eval_status,
            er.routing_correct,
            er.tool_choice_score,
            COUNT(s.id) AS cited_source_count
            FROM agent_research_runs r
            JOIN agent_eval_results er ON er.research_run_id = r.id
            LEFT JOIN agent_research_sources s ON s.research_run_id = r.id AND s.cited = 1
            WHERE r.id = 'run-ranger-liquidated-001'
            GROUP BY r.id, er.id"""
        ).fetchone()
        market_executed = conn.execute(
            """SELECT COUNT(*) AS n
            FROM agent_tool_invocations
            WHERE research_run_id = 'run-ranger-liquidated-001'
            AND tool_category = 'market_data'
            AND decision = 'executed'"""
        ).fetchone()["n"]
        rejected_market = conn.execute(
            """SELECT COUNT(*) AS n
            FROM agent_tool_invocations
            WHERE research_run_id = 'run-ranger-liquidated-001'
            AND tool_category = 'market_data'
            AND decision = 'rejected'"""
        ).fetchone()["n"]

        assert dict(row) == {
            "selected_route": "web_search",
            "selected_provider": "agentcash-stableenrich-exa-search",
            "eval_status": "passed",
            "routing_correct": 1,
            "tool_choice_score": 1.0,
            "cited_source_count": 2,
        }
        assert market_executed == 0
        assert rejected_market == 1
    finally:
        # Close explicitly; an unclosed connection triggers ResourceWarning on 3.13+.
        conn.close()
def test_schema_rejects_secret_flags_bad_scores_and_bad_tool_decisions():
    """Each deliberately invalid INSERT must raise ``sqlite3.IntegrityError``.

    A valid agent, research run, and eval case are inserted first so the
    invalid statements can reference existing parent rows. Every statement in
    ``invalid_statements`` is then executed in turn; the test fails on the
    first one that is accepted.
    """
    conn = _conn()
    try:
        # Valid parent rows referenced by the invalid statements below.
        conn.execute(
            "INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
        )
        conn.execute(
            """INSERT INTO agent_research_runs
            (id, agent_slug, source_surface, request_kind, prompt_sha256, selected_route, status)
            VALUES ('run-constraints', 'leo', 'test', 'benchmark', 'sha256:prompt', 'web_search', 'answered')"""
        )
        conn.execute(
            """INSERT INTO agent_eval_cases
            (id, suite_id, case_slug, prompt_sha256, prompt_excerpt, expected_route)
            VALUES ('case-constraints', 'suite', 'case', 'sha256:prompt', 'redacted prompt', 'web_search')"""
        )
        invalid_statements = [
            # Research run with secret_values_included set to 1.
            """INSERT INTO agent_research_runs
            (id, agent_slug, source_surface, request_kind, prompt_sha256, selected_route, status, secret_values_included)
            VALUES ('run-secret', 'leo', 'test', 'benchmark', 'sha256:secret', 'web_search', 'answered', 1)""",
            # Tool invocation whose decision is 'approved' (labelled a bad enum value).
            """INSERT INTO agent_tool_invocations
            (id, research_run_id, provider, tool_name, tool_category, decision, decision_reason)
            VALUES ('tool-bad-decision', 'run-constraints', 'p', 't', 'web_search', 'approved', 'bad enum')""",
            # Eval result with score 1.1.
            """INSERT INTO agent_eval_results
            (id, eval_case_id, research_run_id, status, score)
            VALUES ('eval-bad-score', 'case-constraints', 'run-constraints', 'passed', 1.1)""",
            # Eval result with routing_correct 2.
            """INSERT INTO agent_eval_results
            (id, eval_case_id, research_run_id, status, routing_correct)
            VALUES ('eval-bad-bool', 'case-constraints', 'run-constraints', 'passed', 2)""",
        ]
        # NOTE(review): any IntegrityError satisfies this loop, so a statement that
        # trips a different constraint than intended would also pass - confirm
        # against the schema that each row is otherwise valid.
        for statement in invalid_statements:
            try:
                conn.execute(statement)
            except sqlite3.IntegrityError:
                pass
            else:
                raise AssertionError(f"invalid statement unexpectedly passed: {statement}")
    finally:
        # Close explicitly; an unclosed connection triggers ResourceWarning on 3.13+.
        conn.close()
def test_research_run_can_be_recorded_without_raw_prompt_or_private_payloads():
    """A run and its tool invocation can be stored using hashes only.

    Inserts a research run and one tool invocation without supplying
    ``prompt_excerpt``, ``answer_excerpt``, or ``secret_values_included``,
    then reads them back and asserts that both excerpts are NULL and both
    secret flags are 0.
    """
    conn = _conn()
    try:
        conn.execute(
            "INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
        )
        # Only hashes and references are provided; no excerpt columns are listed.
        conn.execute(
            """INSERT INTO agent_research_runs
            (id, agent_slug, source_surface, source_ref, request_kind, prompt_sha256,
            selected_route, status, answer_sha256, proof_ref)
            VALUES
            (
            'run-hash-only',
            'leo',
            'api',
            'api:request-redacted',
            'paid_work_order',
            'sha256:prompt-only',
            'social_trends',
            'answered',
            'sha256:answer-only',
            'proof/hash-only.json'
            )"""
        )
        conn.execute(
            """INSERT INTO agent_tool_invocations
            (id, research_run_id, provider, tool_name, tool_category, decision,
            decision_reason, input_sha256, output_sha256)
            VALUES
            (
            'tool-hash-only',
            'run-hash-only',
            'AgentCash StableSocial',
            'lightreel-trends',
            'social_trends',
            'executed',
            'Question asks for current Twitter/X discussion.',
            'sha256:input-only',
            'sha256:output-only'
            )"""
        )
        row = conn.execute(
            """SELECT
            r.prompt_excerpt,
            r.answer_excerpt,
            r.secret_values_included AS run_secret_flag,
            t.secret_values_included AS tool_secret_flag
            FROM agent_research_runs r
            JOIN agent_tool_invocations t ON t.research_run_id = r.id
            WHERE r.id = 'run-hash-only'"""
        ).fetchone()
        # Columns omitted from the INSERTs must read back as NULL / 0.
        assert row["prompt_excerpt"] is None
        assert row["answer_excerpt"] is None
        assert row["run_secret_flag"] == 0
        assert row["tool_secret_flag"] == 0
    finally:
        # Close explicitly; an unclosed connection triggers ResourceWarning on 3.13+.
        conn.close()