Add Teleo research eval schema
Adds graph schema prerequisite plus research-eval schema/docs/tests for Leo tool-use benchmarks and x402 research telemetry. Validated by full local pytest and green CI.
This commit is contained in:
parent
533295d38c
commit
1a71efcde2
6 changed files with 1169 additions and 0 deletions
104
schemas/teleo-agent-graph-v1.md
Normal file
104
schemas/teleo-agent-graph-v1.md
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
# Teleo Agent Graph Schema v1
|
||||
|
||||
Source idea: `teleo-agent-architecture-COMBINED (2).excalidraw`.
|
||||
|
||||
This schema models the agent commons as a graph:
|
||||
|
||||
```text
|
||||
persona -> strategy -> position -> belief -> claim -> evidence
|
||||
```
|
||||
|
||||
The top layers are agent-owned. The lower layers are shared commons.
|
||||
Changes cascade upward: evidence changes trigger re-evaluation of claims, claims flag beliefs,
|
||||
beliefs flag positions, and positions can force persona/strategy review.
|
||||
|
||||
## Design Commitments
|
||||
|
||||
- Personas are authored, stable, and loaded every turn.
|
||||
- Strategies are derived from personas using the Rumelt kernel:
|
||||
diagnosis, guiding policy, proximate objectives.
|
||||
- Positions and beliefs are per-agent public commitments.
|
||||
- Claims are owned by no agent.
|
||||
- Evidence is owned by no agent.
|
||||
- Claims link to claims through typed weighted edges.
|
||||
- One evidence node can ground many claims.
|
||||
- One claim can be cited by many beliefs across agents and domains.
|
||||
- `cited_by` and `importance` are computed/readback fields, not hand-authored
|
||||
truth.
|
||||
- Every edge has a relation, weight, and rationale so cascade behavior is
|
||||
auditable.
|
||||
|
||||
## Main Tables
|
||||
|
||||
| Table | Purpose |
|
||||
| --- | --- |
|
||||
| `agents` | Agent registry: Leo, Rio, Theseus, etc. |
|
||||
| `agent_persona_revisions` | Stable authored identity, voice, and role snapshots |
|
||||
| `agent_strategy_revisions` | Derived diagnosis, guiding policy, and objectives |
|
||||
| `agent_positions` | Per-agent public commitments with falsification criteria |
|
||||
| `agent_beliefs` | Per-agent falsifiable beliefs citing claims |
|
||||
| `claims` | Shared claim commons |
|
||||
| `evidence` | Shared sourced/verifiable evidence commons |
|
||||
| `position_belief_edges` | Position depends on belief |
|
||||
| `belief_claim_edges` | Belief cites or depends on claim |
|
||||
| `claim_edges` | Claim-to-claim typed relationship |
|
||||
| `claim_evidence_edges` | Claim grounded by evidence |
|
||||
| `graph_evaluation_runs` | Evaluation/re-evaluation records |
|
||||
| `cascade_events` | Upward propagation queue/history |
|
||||
| `graph_history_events` | Sanitized GitHub/Forgejo/local-git manifest events |
|
||||
| `graph_node_history_links` | Links history events to graph nodes |
|
||||
|
||||
## Claim Node
|
||||
|
||||
Diagram frontmatter maps to `claims`:
|
||||
|
||||
| Diagram field | Column |
|
||||
| --- | --- |
|
||||
| `type: claim` | implicit table |
|
||||
| `domain` | `claims.domain` |
|
||||
| `description` | `claims.description` |
|
||||
| `confidence` | `claims.confidence` |
|
||||
| `source` | `claims.source_summary`, plus evidence edges |
|
||||
| `created` | `claims.created_at` |
|
||||
| `last_evaluated` | `claims.last_evaluated` |
|
||||
| `cross_references` | `claim_edges` |
|
||||
| `importance` | `claims.importance`, computed from inbound refs |
|
||||
| `attribution` | `claims.attribution_json` |
|
||||
|
||||
## Claim Relations
|
||||
|
||||
| Relation | Meaning |
|
||||
| --- | --- |
|
||||
| `depends_on` | This claim cannot be true unless the linked claim is true |
|
||||
| `supports` | Linked claim provides evidence for this one |
|
||||
| `challenged_by` | Linked claim is counter-argument or counter-evidence |
|
||||
| `cited_by` | Computed inbound reference, not hand-authored |
|
||||
| `related` | Topical link without a specific evidential relationship |
|
||||
|
||||
## Experiment Use
|
||||
|
||||
This schema should be applied after a test database is created and before a
|
||||
history manifest is loaded:
|
||||
|
||||
```text
|
||||
spin up database
|
||||
apply teleo-agent-graph-v1.sql
|
||||
load history manifest through graph adapter
|
||||
run persona/journey/red-team experiments
|
||||
verify cascades and graph invariants
|
||||
tear database down
|
||||
```
|
||||
|
||||
## Minimum Invariants
|
||||
|
||||
- Every active belief must cite at least `min_claims` claims (default 3) before it can be marked
|
||||
`load_bearing`.
|
||||
- Every active claim must have at least one evidence edge before it can be
|
||||
marked `accepted`.
|
||||
- Red-team or quarantined claims cannot be cited by active beliefs unless the
|
||||
edge relation is `challenged_by`.
|
||||
- `claim_edges` cannot self-reference.
|
||||
- `importance` should be recomputed from inbound belief and claim references
|
||||
during loader/evaluation jobs.
|
||||
- Any evidence update must produce cascade events for affected claims and
|
||||
upstream beliefs/positions.
|
||||
251
schemas/teleo-agent-graph-v1.sql
Normal file
251
schemas/teleo-agent-graph-v1.sql
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
-- Teleo Agent Graph Schema v1
|
||||
-- Common SQL subset intended for ephemeral SQLite tests and Postgres/Supabase
|
||||
-- staging. IDs are app-generated text IDs so this can run across engines.
|
||||
|
||||
-- Bookkeeping table: one row per Teleo schema file applied to this database.
CREATE TABLE IF NOT EXISTS graph_schema_version (
    -- Schema identifier, e.g. 'teleo-agent-graph-v1'.
    version    TEXT PRIMARY KEY,
    -- Where the schema design came from.
    source     TEXT NOT NULL,
    -- Filled in by the database at insert time when not supplied.
    applied_at TEXT DEFAULT CURRENT_TIMESTAMP
);
|
||||
|
||||
-- Register this schema version. ON CONFLICT ... DO NOTHING keeps re-applying the
-- file idempotent and, unlike the SQLite-only INSERT OR IGNORE, is accepted by
-- both SQLite (3.24+) and PostgreSQL (9.5+), matching the "common SQL subset"
-- goal stated in the file header.
INSERT INTO graph_schema_version (version, source)
VALUES ('teleo-agent-graph-v1', 'teleo-agent-architecture-excalidraw')
ON CONFLICT (version) DO NOTHING;
|
||||
|
||||
-- Agent registry, keyed by slug. Other tables reference agents(slug).
CREATE TABLE IF NOT EXISTS agents (
    slug         TEXT PRIMARY KEY,
    display_name TEXT NOT NULL,
    archetype    TEXT,
    -- Lifecycle state; new rows default to 'active'.
    status       TEXT NOT NULL DEFAULT 'active'
        CHECK (status IN ('active', 'inactive', 'deprecated')),
    created_at   TEXT DEFAULT CURRENT_TIMESTAMP,
    -- NOTE(review): only defaulted on insert here; presumably the application
    -- refreshes it on update -- confirm.
    updated_at   TEXT DEFAULT CURRENT_TIMESTAMP
);
|
||||
|
||||
-- Persona snapshots (identity, voice, role) per agent, numbered by revision.
CREATE TABLE IF NOT EXISTS agent_persona_revisions (
    id               TEXT PRIMARY KEY,
    agent_slug       TEXT NOT NULL REFERENCES agents(slug),
    revision         INTEGER NOT NULL,
    identity         TEXT NOT NULL,
    voice            TEXT NOT NULL,
    role             TEXT NOT NULL,
    authored_by      TEXT,
    -- Boolean flags are stored as 0/1 integers so the DDL runs on SQLite too.
    stable           INTEGER NOT NULL DEFAULT 1 CHECK (stable IN (0, 1)),
    loads_every_turn INTEGER NOT NULL DEFAULT 1 CHECK (loads_every_turn IN (0, 1)),
    active           INTEGER NOT NULL DEFAULT 1 CHECK (active IN (0, 1)),
    notes            TEXT,
    created_at       TEXT DEFAULT CURRENT_TIMESTAMP,
    -- A revision number can be used only once per agent.
    UNIQUE (agent_slug, revision)
);
|
||||
|
||||
-- Strategy snapshots per agent: diagnosis, guiding policy, proximate objectives.
CREATE TABLE IF NOT EXISTS agent_strategy_revisions (
    id                        TEXT PRIMARY KEY,
    agent_slug                TEXT NOT NULL REFERENCES agents(slug),
    -- Optional link to the persona revision this strategy was derived from.
    persona_revision_id       TEXT REFERENCES agent_persona_revisions(id),
    revision                  INTEGER NOT NULL,
    diagnosis                 TEXT NOT NULL,
    guiding_policy            TEXT NOT NULL,
    -- JSON text; defaults to an empty array.
    proximate_objectives_json TEXT NOT NULL DEFAULT '[]',
    derivation_notes          TEXT,
    active                    INTEGER NOT NULL DEFAULT 1 CHECK (active IN (0, 1)),
    created_at                TEXT DEFAULT CURRENT_TIMESTAMP,
    -- A revision number can be used only once per agent.
    UNIQUE (agent_slug, revision)
);
|
||||
|
||||
-- Per-agent positions (public commitments) with optional falsification criteria.
CREATE TABLE IF NOT EXISTS agent_positions (
    id                     TEXT PRIMARY KEY,
    agent_slug             TEXT NOT NULL REFERENCES agents(slug),
    title                  TEXT NOT NULL,
    statement              TEXT NOT NULL,
    falsification_criteria TEXT,
    -- 0/1 flag; defaults to 1.
    public_commitment      INTEGER NOT NULL DEFAULT 1
        CHECK (public_commitment IN (0, 1)),
    -- Same confidence vocabulary as agent_beliefs and claims.
    confidence             TEXT NOT NULL DEFAULT 'experimental'
        CHECK (confidence IN ('proven', 'likely', 'experimental', 'speculative')),
    status                 TEXT NOT NULL DEFAULT 'active'
        CHECK (status IN ('draft', 'active', 'flagged', 'retired')),
    created_at             TEXT DEFAULT CURRENT_TIMESTAMP,
    last_reviewed          TEXT
);
|
||||
|
||||
-- Per-agent falsifiable beliefs. Beliefs cite claims through belief_claim_edges.
CREATE TABLE IF NOT EXISTS agent_beliefs (
    id                     TEXT PRIMARY KEY,
    agent_slug             TEXT NOT NULL REFERENCES agents(slug),
    -- Agent-scoped identifier; unique together with agent_slug (see below).
    belief_code            TEXT NOT NULL,
    title                  TEXT NOT NULL,
    statement              TEXT NOT NULL,
    falsification_criteria TEXT,
    is_keystone            INTEGER NOT NULL DEFAULT 0 CHECK (is_keystone IN (0, 1)),
    -- Number of cited claims required for this belief (default 3). A negative
    -- threshold is meaningless, so it is rejected here like the other bounded
    -- numeric columns in this schema.
    min_claims             INTEGER NOT NULL DEFAULT 3 CHECK (min_claims >= 0),
    confidence             TEXT NOT NULL DEFAULT 'experimental'
        CHECK (confidence IN ('proven', 'likely', 'experimental', 'speculative')),
    status                 TEXT NOT NULL DEFAULT 'active'
        CHECK (status IN ('draft', 'active', 'load_bearing', 'flagged', 'retired')),
    created_at             TEXT DEFAULT CURRENT_TIMESTAMP,
    last_evaluated         TEXT,
    UNIQUE (agent_slug, belief_code)
);
|
||||
|
||||
-- Shared evidence commons: evidence rows carry no agent reference.
CREATE TABLE IF NOT EXISTS evidence (
    id                  TEXT PRIMARY KEY,
    evidence_type       TEXT NOT NULL
        CHECK (evidence_type IN (
            'study', 'data', 'event', 'formal_result',
            'legal', 'market', 'historical', 'other'
        )),
    title               TEXT NOT NULL,
    source_uri          TEXT,
    citation            TEXT,
    summary             TEXT NOT NULL,
    -- New evidence starts as 'unverified'.
    verification_status TEXT NOT NULL DEFAULT 'unverified'
        CHECK (verification_status IN (
            'unverified', 'sourced', 'verified', 'disputed', 'retracted'
        )),
    observed_at         TEXT,
    -- JSON text; defaults to an empty object.
    attribution_json    TEXT NOT NULL DEFAULT '{}',
    created_at          TEXT DEFAULT CURRENT_TIMESTAMP
);
|
||||
|
||||
-- Shared claim commons: claim rows carry no agent foreign key.
CREATE TABLE IF NOT EXISTS claims (
    id                  TEXT PRIMARY KEY,
    slug                TEXT NOT NULL UNIQUE,
    domain              TEXT NOT NULL,
    description         TEXT NOT NULL,
    confidence          TEXT NOT NULL DEFAULT 'experimental'
        CHECK (confidence IN ('proven', 'likely', 'experimental', 'speculative')),
    source_summary      TEXT,
    proposed_by         TEXT,
    -- Optional pointer to one evidence row; further evidence is linked through
    -- claim_evidence_edges.
    primary_evidence_id TEXT REFERENCES evidence(id),
    -- Bounded to [0, 1]; defaults to 0.
    importance          REAL NOT NULL DEFAULT 0
        CHECK (importance BETWEEN 0 AND 1),
    -- New claims start as 'draft'.
    status              TEXT NOT NULL DEFAULT 'draft'
        CHECK (status IN (
            'draft', 'active', 'accepted', 'challenged', 'quarantined', 'retired'
        )),
    -- JSON text; defaults to an empty object.
    attribution_json    TEXT NOT NULL DEFAULT '{}',
    created_at          TEXT DEFAULT CURRENT_TIMESTAMP,
    last_evaluated      TEXT
);
|
||||
|
||||
-- Edge: a position relates to a belief (default relation 'depends_on').
CREATE TABLE IF NOT EXISTS position_belief_edges (
    id          TEXT PRIMARY KEY,
    position_id TEXT NOT NULL REFERENCES agent_positions(id),
    belief_id   TEXT NOT NULL REFERENCES agent_beliefs(id),
    relation    TEXT NOT NULL DEFAULT 'depends_on'
        CHECK (relation IN ('depends_on', 'supports', 'challenged_by', 'related')),
    -- Edge weight bounded to [0, 1]; defaults to 1.
    weight      REAL NOT NULL DEFAULT 1 CHECK (weight BETWEEN 0 AND 1),
    -- Required explanation for the edge.
    rationale   TEXT NOT NULL,
    created_at  TEXT DEFAULT CURRENT_TIMESTAMP,
    -- At most one edge per (position, belief, relation) triple.
    UNIQUE (position_id, belief_id, relation)
);
|
||||
|
||||
-- Edge: a belief cites or otherwise relates to a claim (default relation 'cites').
CREATE TABLE IF NOT EXISTS belief_claim_edges (
    id         TEXT PRIMARY KEY,
    belief_id  TEXT NOT NULL REFERENCES agent_beliefs(id),
    claim_id   TEXT NOT NULL REFERENCES claims(id),
    relation   TEXT NOT NULL DEFAULT 'cites'
        CHECK (relation IN (
            'cites', 'depends_on', 'supports', 'challenged_by', 'related'
        )),
    -- Edge weight bounded to [0, 1]; defaults to 1.
    weight     REAL NOT NULL DEFAULT 1 CHECK (weight BETWEEN 0 AND 1),
    -- Required explanation for the edge.
    rationale  TEXT NOT NULL,
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    -- At most one edge per (belief, claim, relation) triple.
    UNIQUE (belief_id, claim_id, relation)
);
|
||||
|
||||
-- Typed, weighted claim-to-claim edge. No default relation: writers must pick one.
CREATE TABLE IF NOT EXISTS claim_edges (
    id            TEXT PRIMARY KEY,
    from_claim_id TEXT NOT NULL REFERENCES claims(id),
    to_claim_id   TEXT NOT NULL REFERENCES claims(id),
    relation      TEXT NOT NULL
        CHECK (relation IN (
            'depends_on', 'supports', 'challenged_by', 'cited_by', 'related'
        )),
    -- Edge weight bounded to [0, 1]; defaults to 1.
    weight        REAL NOT NULL DEFAULT 1 CHECK (weight BETWEEN 0 AND 1),
    rationale     TEXT NOT NULL,
    authored_by   TEXT,
    -- 0/1 flag, defaults to 0.
    -- NOTE(review): presumably set to 1 for derived 'cited_by' edges; the schema
    -- does not enforce that pairing -- confirm with the loader.
    computed      INTEGER NOT NULL DEFAULT 0 CHECK (computed IN (0, 1)),
    created_at    TEXT DEFAULT CURRENT_TIMESTAMP,
    -- A claim may not link to itself.
    CHECK (from_claim_id <> to_claim_id),
    -- At most one edge per (from, to, relation) triple.
    UNIQUE (from_claim_id, to_claim_id, relation)
);
|
||||
|
||||
-- Edge: a claim is related to an evidence row (default relation 'supports').
CREATE TABLE IF NOT EXISTS claim_evidence_edges (
    id          TEXT PRIMARY KEY,
    claim_id    TEXT NOT NULL REFERENCES claims(id),
    evidence_id TEXT NOT NULL REFERENCES evidence(id),
    relation    TEXT NOT NULL DEFAULT 'supports'
        CHECK (relation IN (
            'primary', 'supports', 'challenges', 'context', 'weakens'
        )),
    -- Edge weight bounded to [0, 1]; defaults to 1.
    weight      REAL NOT NULL DEFAULT 1 CHECK (weight BETWEEN 0 AND 1),
    -- Required explanation for the edge.
    rationale   TEXT NOT NULL,
    created_at  TEXT DEFAULT CURRENT_TIMESTAMP,
    -- At most one edge per (claim, evidence, relation) triple.
    UNIQUE (claim_id, evidence_id, relation)
);
|
||||
|
||||
-- Evaluation / re-evaluation records for graph nodes and edges.
CREATE TABLE IF NOT EXISTS graph_evaluation_runs (
    id           TEXT PRIMARY KEY,
    -- Layer of the evaluated object; target_id is a plain text id with no
    -- foreign key because it can point into any of these layers.
    target_layer TEXT NOT NULL
        CHECK (target_layer IN (
            'persona', 'strategy', 'position', 'belief', 'claim', 'evidence', 'edge'
        )),
    target_id    TEXT NOT NULL,
    trigger_type TEXT NOT NULL
        CHECK (trigger_type IN (
            'scheduled', 'history_replay', 'evidence_changed',
            'claim_changed', 'manual', 'red_team'
        )),
    trigger_id   TEXT,
    evaluator    TEXT NOT NULL,
    model        TEXT,
    verdict      TEXT NOT NULL
        CHECK (verdict IN (
            'approve', 'request_changes', 'reject', 'flag', 'quarantine', 'no_op'
        )),
    -- Optional numeric confidence in [0, 1]; NULL is allowed.
    confidence   REAL CHECK (confidence IS NULL OR confidence BETWEEN 0 AND 1),
    notes        TEXT,
    created_at   TEXT DEFAULT CURRENT_TIMESTAMP
);
|
||||
|
||||
-- Propagation queue/history: a change in one layer that affects a node in another.
CREATE TABLE IF NOT EXISTS cascade_events (
    id             TEXT PRIMARY KEY,
    changed_layer  TEXT NOT NULL
        CHECK (changed_layer IN (
            'evidence', 'claim', 'belief', 'position', 'strategy', 'persona'
        )),
    changed_id     TEXT NOT NULL,
    -- Note: 'evidence' is accepted as a changed layer but not as an affected one.
    affected_layer TEXT NOT NULL
        CHECK (affected_layer IN (
            'claim', 'belief', 'position', 'strategy', 'persona'
        )),
    affected_id    TEXT NOT NULL,
    direction      TEXT NOT NULL DEFAULT 'up'
        CHECK (direction IN ('up', 'down', 'lateral')),
    -- New events start as 'queued'.
    status         TEXT NOT NULL DEFAULT 'queued'
        CHECK (status IN ('queued', 'reviewing', 'resolved', 'ignored')),
    reason         TEXT NOT NULL,
    created_at     TEXT DEFAULT CURRENT_TIMESTAMP,
    resolved_at    TEXT
);
|
||||
|
||||
-- History manifest events from external providers or manual entry.
CREATE TABLE IF NOT EXISTS graph_history_events (
    id                TEXT PRIMARY KEY,
    provider          TEXT NOT NULL
        CHECK (provider IN (
            'github', 'forgejo', 'local_git', 'web', 'x', 'telegram', 'manual'
        )),
    repo              TEXT,
    provider_event_id TEXT,
    event_type        TEXT NOT NULL,
    actor             TEXT,
    occurred_at       TEXT,
    -- JSON text; defaults to an empty object.
    payload_json      TEXT NOT NULL DEFAULT '{}',
    -- 0/1 flag; rows are marked redacted (1) unless the writer says otherwise.
    redacted          INTEGER NOT NULL DEFAULT 1 CHECK (redacted IN (0, 1)),
    created_at        TEXT DEFAULT CURRENT_TIMESTAMP
);
|
||||
|
||||
-- Links a history event to a graph node, tagged with the role the event played.
CREATE TABLE IF NOT EXISTS graph_node_history_links (
    history_event_id TEXT NOT NULL REFERENCES graph_history_events(id),
    -- node_id is a plain text id; node_layer says which layer it belongs to.
    node_layer       TEXT NOT NULL
        CHECK (node_layer IN (
            'persona', 'strategy', 'position', 'belief', 'claim', 'evidence', 'edge'
        )),
    node_id          TEXT NOT NULL,
    role             TEXT NOT NULL
        CHECK (role IN (
            'created', 'updated', 'evaluated', 'merged',
            'challenged', 'cited', 'sourced'
        )),
    -- Composite key: the same event/node pair may appear once per role.
    PRIMARY KEY (history_event_id, node_layer, node_id, role)
);
|
||||
|
||||
-- Secondary indexes for the graph schema.

-- Per-agent lookups filtered by active flag or status.
CREATE INDEX IF NOT EXISTS idx_persona_revisions_agent_active
    ON agent_persona_revisions(agent_slug, active);
CREATE INDEX IF NOT EXISTS idx_strategy_revisions_agent_active
    ON agent_strategy_revisions(agent_slug, active);
CREATE INDEX IF NOT EXISTS idx_positions_agent_status
    ON agent_positions(agent_slug, status);
CREATE INDEX IF NOT EXISTS idx_beliefs_agent_status
    ON agent_beliefs(agent_slug, status);

-- Claim and evidence filters.
CREATE INDEX IF NOT EXISTS idx_claims_domain_status
    ON claims(domain, status);
CREATE INDEX IF NOT EXISTS idx_claims_importance
    ON claims(importance);
CREATE INDEX IF NOT EXISTS idx_evidence_status
    ON evidence(verification_status);

-- Reverse lookups on edge tables: each UNIQUE constraint leads with the
-- upper-layer node, so finding edges from the lower-layer node needs its own
-- index. position_belief_edges previously had none, unlike the other three
-- edge tables.
CREATE INDEX IF NOT EXISTS idx_position_belief_edges_belief
    ON position_belief_edges(belief_id, relation);
CREATE INDEX IF NOT EXISTS idx_belief_claim_edges_claim
    ON belief_claim_edges(claim_id, relation);
CREATE INDEX IF NOT EXISTS idx_claim_edges_to
    ON claim_edges(to_claim_id, relation);
CREATE INDEX IF NOT EXISTS idx_claim_evidence_edges_evidence
    ON claim_evidence_edges(evidence_id, relation);

-- Cascade queue and history filters.
CREATE INDEX IF NOT EXISTS idx_cascade_status
    ON cascade_events(status, affected_layer);
CREATE INDEX IF NOT EXISTS idx_history_provider_repo
    ON graph_history_events(provider, repo, event_type);
|
||||
73
schemas/teleo-agent-research-eval-v1.md
Normal file
73
schemas/teleo-agent-research-eval-v1.md
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
# Teleo Agent Research Eval Schema v1
|
||||
|
||||
Apply this schema after `teleo-agent-graph-v1.sql`.
|
||||
|
||||
This schema records how Leo and other agents answer research requests, which
|
||||
tools they choose, what sources they cite, and whether benchmark cases passed.
|
||||
It is operational/economic telemetry, not the claim/evidence graph itself.
|
||||
|
||||
## Design Commitments
|
||||
|
||||
- The graph schema remains the knowledge spine: persona, strategy, beliefs,
|
||||
claims, evidence, graph evals, and cascades.
|
||||
- Research-eval rows explain how a request was handled and whether the route was
|
||||
good enough to trust or ship.
|
||||
- Payment funds work. It does not directly mutate claims, confidence, beliefs,
|
||||
or rewards.
|
||||
- Tool-use benchmarking must distinguish candidates, selected tools, executed
|
||||
tools, skipped tools, and rejected tools.
|
||||
- Secrets and private payloads are never stored. Tables store hashes, redacted
|
||||
excerpts, proof references, source metadata, and receipt ids.
|
||||
|
||||
## Main Tables
|
||||
|
||||
| Table | Purpose |
|
||||
| --- | --- |
|
||||
| `agent_research_runs` | One row per research request from a Telegram, API, checkout, web, CLI, or test surface; benchmark runs are marked via `request_kind`. |
|
||||
| `agent_tool_invocations` | One row per candidate, selected, executed, skipped, rejected, fallback, or failed tool decision. |
|
||||
| `agent_research_sources` | Retrieved or cited source rows tied to a run and optionally a tool invocation. |
|
||||
| `agent_eval_cases` | Versioned benchmark prompts, expected routes/providers, tool constraints, tags, and rubrics. |
|
||||
| `agent_eval_results` | Per-case result, routing correctness, tool score, source quality, groundedness, cost, and safety scores. |
|
||||
| `work_order_graph_links` | Links sponsored work orders to research runs, tool traces, graph evals, evidence, claims, and outcomes. |
|
||||
|
||||
## Leo x402 Research Flow
|
||||
|
||||
```text
|
||||
Telegram/API question
|
||||
-> agent_research_runs
|
||||
-> agent_tool_invocations
|
||||
-> agent_research_sources
|
||||
-> agent_eval_results when a benchmark case applies
|
||||
-> work_order_graph_links when a paid work order or graph artifact is involved
|
||||
```
|
||||
|
||||
For paid research, `agent_research_runs.sponsored_work_order_id` and
|
||||
`payment_receipt_id` carry the external work-order/payment anchors. The payment
|
||||
receipt table is still owned by the economic/payment layer; this schema only
|
||||
keeps references.
|
||||
|
||||
## Ranger Liquidation Guard
|
||||
|
||||
The Ranger benchmark class should be represented as:
|
||||
|
||||
- `agent_eval_cases.expected_route = 'web_search'`
|
||||
- `agent_eval_cases.tags_json` includes `ranger_liquidated`
|
||||
- `agent_eval_cases.must_not_use_tools_json` includes market-data-only routes
|
||||
- `agent_tool_invocations` records market data as `rejected` or `skipped` when
|
||||
it is not the right tool
|
||||
- `agent_eval_results.routing_correct = 1` only if Leo routed to source-backed
|
||||
research instead of live-token valuation
|
||||
|
||||
This ensures "Ranger is liquidated/gone" is verified before any valuation
|
||||
framing and never silently treated as a normal live fair-value token question.
|
||||
|
||||
## Minimum Invariants
|
||||
|
||||
- No row may set `secret_values_included = 1`.
|
||||
- A benchmark result must link to both an eval case and a research run.
|
||||
- Tool invocation sequence numbers are unique per research run.
|
||||
- Scores are bounded between `0` and `1`.
|
||||
- Research runs store prompt and answer hashes plus optional redacted excerpts,
|
||||
not raw private prompts.
|
||||
- `outcome_observations` remain the downstream business-value layer; raw tool
|
||||
traces belong here, not there.
|
||||
247
schemas/teleo-agent-research-eval-v1.sql
Normal file
247
schemas/teleo-agent-research-eval-v1.sql
Normal file
|
|
@ -0,0 +1,247 @@
|
|||
-- Teleo Agent Research Eval Schema v1
|
||||
-- Common SQL subset intended for ephemeral SQLite tests and Postgres/Supabase
|
||||
-- staging. IDs are app-generated text IDs so this can run across engines.
|
||||
--
|
||||
-- Apply after teleo-agent-graph-v1.sql.
|
||||
--
|
||||
-- Secret policy: store hashes, redacted excerpts, and proof references only.
|
||||
-- Raw prompts, bearer tokens, API keys, wallet secrets, and private receipts do
|
||||
-- not belong in these tables.
|
||||
|
||||
-- Register this schema version in the table created by teleo-agent-graph-v1.sql.
-- ON CONFLICT ... DO NOTHING keeps re-applying the file idempotent and, unlike
-- the SQLite-only INSERT OR IGNORE, is accepted by both SQLite (3.24+) and
-- PostgreSQL (9.5+).
INSERT INTO graph_schema_version (version, source)
VALUES ('teleo-agent-research-eval-v1', 'leo-x402-research-routing-benchmark')
ON CONFLICT (version) DO NOTHING;
|
||||
|
||||
-- One row per research request handled by an agent.
-- Stores hashes and bounded excerpts only (see the secret policy in the header).
CREATE TABLE IF NOT EXISTS agent_research_runs (
    id                      TEXT PRIMARY KEY,
    agent_slug              TEXT NOT NULL REFERENCES agents(slug),
    -- Surface the request arrived on.
    source_surface          TEXT NOT NULL
        CHECK (source_surface IN (
            'telegram', 'api', 'checkout', 'web', 'cli', 'test', 'other'
        )),
    source_ref              TEXT,
    request_kind            TEXT NOT NULL DEFAULT 'free'
        CHECK (request_kind IN (
            'free', 'paid_quote', 'paid_work_order', 'benchmark', 'system'
        )),
    -- Plain text references with no foreign key in this schema.
    sponsored_work_order_id TEXT,
    payment_receipt_id      TEXT,
    prompt_sha256           TEXT NOT NULL,
    prompt_excerpt          TEXT,
    selected_provider       TEXT,
    -- Same route vocabulary as agent_eval_cases.expected_route.
    selected_route          TEXT NOT NULL DEFAULT 'unknown'
        CHECK (selected_route IN (
            'none',
            'web_search',
            'social_trends',
            'structured_market_data',
            'local_context',
            'mixed',
            'unknown'
        )),
    status                  TEXT NOT NULL DEFAULT 'running'
        CHECK (status IN (
            'quoted',
            'payment_pending',
            'running',
            'answered',
            'abstained',
            'blocked',
            'failed',
            'cancelled'
        )),
    answer_sha256           TEXT,
    answer_excerpt          TEXT,
    proof_ref               TEXT,
    -- NOTE(review): REAL is inexact for monetary amounts; confirm the precision
    -- this telemetry needs.
    cost_amount             REAL NOT NULL DEFAULT 0 CHECK (cost_amount >= 0),
    currency                TEXT NOT NULL DEFAULT 'USDC',
    latency_ms              INTEGER CHECK (latency_ms IS NULL OR latency_ms >= 0),
    source_count            INTEGER NOT NULL DEFAULT 0 CHECK (source_count >= 0),
    -- Guard column: the CHECK only admits 0, so a row flagged as containing
    -- secrets cannot be stored.
    secret_values_included  INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),
    created_at              TEXT DEFAULT CURRENT_TIMESTAMP,
    completed_at            TEXT,
    -- Excerpt length caps: 1000 characters for prompts, 2000 for answers.
    CHECK (prompt_excerpt IS NULL OR length(prompt_excerpt) <= 1000),
    CHECK (answer_excerpt IS NULL OR length(answer_excerpt) <= 2000)
);

-- Lookups by agent over time, by work order, and by status/route.
CREATE INDEX IF NOT EXISTS idx_agent_research_runs_agent_created
    ON agent_research_runs(agent_slug, created_at);
CREATE INDEX IF NOT EXISTS idx_agent_research_runs_work_order
    ON agent_research_runs(sponsored_work_order_id);
CREATE INDEX IF NOT EXISTS idx_agent_research_runs_status_route
    ON agent_research_runs(status, selected_route);
|
||||
|
||||
-- One row per tool decision within a research run (candidate, selected,
-- executed, skipped, rejected, fallback, or failed).
CREATE TABLE IF NOT EXISTS agent_tool_invocations (
    id                     TEXT PRIMARY KEY,
    -- Rows are removed together with their research run.
    research_run_id        TEXT NOT NULL
        REFERENCES agent_research_runs(id) ON DELETE CASCADE,
    -- Position within the run; unique per run (see UNIQUE below).
    sequence               INTEGER NOT NULL DEFAULT 0 CHECK (sequence >= 0),
    provider               TEXT NOT NULL,
    tool_name              TEXT NOT NULL,
    tool_category          TEXT NOT NULL
        CHECK (tool_category IN (
            'web_search',
            'social_trends',
            'market_data',
            'page_read',
            'x402_checkout',
            'agentcash',
            'faremeter',
            'database',
            'local_context',
            'other'
        )),
    endpoint_host          TEXT,
    endpoint_hash          TEXT,
    decision               TEXT NOT NULL
        CHECK (decision IN (
            'candidate', 'selected', 'executed', 'skipped',
            'rejected', 'fallback', 'failed'
        )),
    -- Every decision must carry a reason.
    decision_reason        TEXT NOT NULL,
    paid                   INTEGER NOT NULL DEFAULT 0 CHECK (paid IN (0, 1)),
    -- Payment rail, when one applies.
    rail                   TEXT
        CHECK (rail IS NULL OR rail IN ('x402', 'agentcash', 'manual', 'free', 'other')),
    network                TEXT,
    -- NOTE(review): REAL is inexact for monetary amounts; confirm the precision
    -- this telemetry needs.
    amount                 REAL CHECK (amount IS NULL OR amount >= 0),
    currency               TEXT NOT NULL DEFAULT 'USDC',
    payment_receipt_id     TEXT,
    input_sha256           TEXT,
    output_sha256          TEXT,
    source_count           INTEGER NOT NULL DEFAULT 0 CHECK (source_count >= 0),
    latency_ms             INTEGER CHECK (latency_ms IS NULL OR latency_ms >= 0),
    error_class            TEXT,
    -- Guard column: the CHECK only admits 0.
    secret_values_included INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),
    created_at             TEXT DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (research_run_id, sequence)
);

-- Lookups by run/decision, provider/category, and payment receipt.
CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_run_decision
    ON agent_tool_invocations(research_run_id, decision);
CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_provider_category
    ON agent_tool_invocations(provider, tool_category);
CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_receipt
    ON agent_tool_invocations(payment_receipt_id);
|
||||
|
||||
-- Retrieved or cited sources tied to a research run and, optionally, to the
-- tool invocation that produced them.
CREATE TABLE IF NOT EXISTS agent_research_sources (
    id                     TEXT PRIMARY KEY,
    -- Rows are removed together with their research run.
    research_run_id        TEXT NOT NULL
        REFERENCES agent_research_runs(id) ON DELETE CASCADE,
    -- Deleting the invocation keeps the source row and nulls this link.
    tool_invocation_id     TEXT
        REFERENCES agent_tool_invocations(id) ON DELETE SET NULL,
    source_type            TEXT NOT NULL
        CHECK (source_type IN (
            'web', 'social', 'market', 'db', 'document', 'other'
        )),
    source_uri             TEXT,
    source_uri_sha256      TEXT,
    title                  TEXT,
    -- 0/1 flag; defaults to 0 (not cited).
    cited                  INTEGER NOT NULL DEFAULT 0 CHECK (cited IN (0, 1)),
    retrieval_rank         INTEGER
        CHECK (retrieval_rank IS NULL OR retrieval_rank >= 0),
    observed_at            TEXT,
    support_status         TEXT NOT NULL DEFAULT 'unknown'
        CHECK (support_status IN (
            'supports', 'context', 'conflicts', 'stale', 'unknown'
        )),
    -- Guard column: the CHECK only admits 0.
    secret_values_included INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),
    created_at             TEXT DEFAULT CURRENT_TIMESTAMP
);

-- Lookups by run (optionally filtered on cited) and by tool invocation.
CREATE INDEX IF NOT EXISTS idx_agent_research_sources_run
    ON agent_research_sources(research_run_id, cited);
CREATE INDEX IF NOT EXISTS idx_agent_research_sources_tool
    ON agent_research_sources(tool_invocation_id);
|
||||
|
||||
-- Versioned benchmark cases: prompt, expected route/provider, tool constraints,
-- tags, and rubric.
CREATE TABLE IF NOT EXISTS agent_eval_cases (
    id                      TEXT PRIMARY KEY,
    suite_id                TEXT NOT NULL,
    case_slug               TEXT NOT NULL,
    -- Versions start at 1.
    case_version            INTEGER NOT NULL DEFAULT 1 CHECK (case_version >= 1),
    prompt_sha256           TEXT NOT NULL,
    -- Required excerpt, capped at 1000 characters.
    prompt_excerpt          TEXT NOT NULL
        CHECK (length(prompt_excerpt) <= 1000),
    fixture_context_sha256  TEXT,
    -- Optional excerpt, capped at 2000 characters.
    fixture_context_excerpt TEXT
        CHECK (fixture_context_excerpt IS NULL OR length(fixture_context_excerpt) <= 2000),
    -- Same route vocabulary as agent_research_runs.selected_route.
    expected_route          TEXT NOT NULL
        CHECK (expected_route IN (
            'none',
            'web_search',
            'social_trends',
            'structured_market_data',
            'local_context',
            'mixed',
            'unknown'
        )),
    expected_provider       TEXT,
    -- JSON text columns; arrays default to '[]', the rubric to '{}'.
    must_use_tools_json     TEXT NOT NULL DEFAULT '[]',
    must_not_use_tools_json TEXT NOT NULL DEFAULT '[]',
    tags_json               TEXT NOT NULL DEFAULT '[]',
    rubric_json             TEXT NOT NULL DEFAULT '{}',
    stale_after             TEXT,
    active                  INTEGER NOT NULL DEFAULT 1 CHECK (active IN (0, 1)),
    -- Guard column: the CHECK only admits 0.
    secret_values_included  INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),
    created_at              TEXT DEFAULT CURRENT_TIMESTAMP,
    -- A case version can exist only once within a suite.
    UNIQUE (suite_id, case_slug, case_version)
);

-- Lookups by suite/active flag and by expected route.
CREATE INDEX IF NOT EXISTS idx_agent_eval_cases_suite_active
    ON agent_eval_cases(suite_id, active);
CREATE INDEX IF NOT EXISTS idx_agent_eval_cases_route
    ON agent_eval_cases(expected_route);
|
||||
|
||||
-- Result of running one benchmark case against one research run.
CREATE TABLE IF NOT EXISTS agent_eval_results (
    id                      TEXT PRIMARY KEY,
    -- Both links are mandatory; results are removed with either parent.
    eval_case_id            TEXT NOT NULL
        REFERENCES agent_eval_cases(id) ON DELETE CASCADE,
    research_run_id         TEXT NOT NULL
        REFERENCES agent_research_runs(id) ON DELETE CASCADE,
    -- Optional link to a graph evaluation; nulled if that run is deleted.
    graph_evaluation_run_id TEXT
        REFERENCES graph_evaluation_runs(id) ON DELETE SET NULL,
    status                  TEXT NOT NULL
        CHECK (status IN ('passed', 'failed', 'warning', 'blocked', 'skipped')),
    -- All score columns are optional and bounded to [0, 1].
    score                   REAL
        CHECK (score IS NULL OR score BETWEEN 0 AND 1),
    -- Optional 0/1 flag.
    routing_correct         INTEGER
        CHECK (routing_correct IS NULL OR routing_correct IN (0, 1)),
    tool_choice_score       REAL
        CHECK (tool_choice_score IS NULL OR tool_choice_score BETWEEN 0 AND 1),
    source_quality_score    REAL
        CHECK (source_quality_score IS NULL OR source_quality_score BETWEEN 0 AND 1),
    groundedness_score      REAL
        CHECK (groundedness_score IS NULL OR groundedness_score BETWEEN 0 AND 1),
    freshness_score         REAL
        CHECK (freshness_score IS NULL OR freshness_score BETWEEN 0 AND 1),
    cost_efficiency_score   REAL
        CHECK (cost_efficiency_score IS NULL OR cost_efficiency_score BETWEEN 0 AND 1),
    safety_payment_score    REAL
        CHECK (safety_payment_score IS NULL OR safety_payment_score BETWEEN 0 AND 1),
    failure_reason          TEXT,
    judge                   TEXT,
    proof_ref               TEXT,
    -- Guard column: the CHECK only admits 0.
    secret_values_included  INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),
    created_at              TEXT DEFAULT CURRENT_TIMESTAMP,
    -- At most one result per (case, run) pair.
    UNIQUE (eval_case_id, research_run_id)
);

-- Lookups by case/status, by research run, and by graph evaluation run.
CREATE INDEX IF NOT EXISTS idx_agent_eval_results_case_status
    ON agent_eval_results(eval_case_id, status);
CREATE INDEX IF NOT EXISTS idx_agent_eval_results_run
    ON agent_eval_results(research_run_id);
CREATE INDEX IF NOT EXISTS idx_agent_eval_results_graph_eval
    ON agent_eval_results(graph_evaluation_run_id);
|
||||
|
||||
-- Links a sponsored work order to graph nodes, research runs, tool traces,
-- evaluation results, and outcomes.
CREATE TABLE IF NOT EXISTS work_order_graph_links (
    id                      TEXT PRIMARY KEY,
    -- Plain text reference with no foreign key in this schema.
    sponsored_work_order_id TEXT NOT NULL,
    -- What the linked object is to the work order.
    role                    TEXT NOT NULL
        CHECK (role IN (
            'input_context',
            'evaluation_target',
            'created_evidence',
            'created_claim',
            'created_eval_run',
            'research_run',
            'tool_trace',
            'history_trace',
            'outcome_trace'
        )),
    -- Kind of object graph_id points to; graph_id itself has no foreign key
    -- because it can point into any of these layers.
    graph_layer             TEXT NOT NULL
        CHECK (graph_layer IN (
            'persona',
            'strategy',
            'position',
            'belief',
            'claim',
            'evidence',
            'edge',
            'graph_evaluation_run',
            'cascade_event',
            'graph_history_event',
            'agent_research_run',
            'agent_tool_invocation',
            'agent_eval_result',
            'outcome_observation'
        )),
    graph_id                TEXT NOT NULL,
    rationale               TEXT,
    -- Guard column: the CHECK only admits 0.
    secret_values_included  INTEGER NOT NULL DEFAULT 0
        CHECK (secret_values_included = 0),
    created_at              TEXT DEFAULT CURRENT_TIMESTAMP,
    -- The same link (work order, role, layer, id) can be recorded only once.
    UNIQUE (sponsored_work_order_id, role, graph_layer, graph_id)
);

-- Lookups by work order and by linked graph object.
CREATE INDEX IF NOT EXISTS idx_work_order_graph_links_work_order
    ON work_order_graph_links(sponsored_work_order_id);
CREATE INDEX IF NOT EXISTS idx_work_order_graph_links_graph
    ON work_order_graph_links(graph_layer, graph_id);
|
||||
129
tests/test_agent_graph_schema_sql.py
Normal file
129
tests/test_agent_graph_schema_sql.py
Normal file
|
|
@ -0,0 +1,129 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-graph-v1.sql"
|
||||
|
||||
|
||||
def test_agent_graph_schema_applies_and_models_shared_nodes():
    """Apply the graph schema and verify that commons nodes can be shared.

    Builds a small fixture with two agents, two evidence rows and three claims,
    then checks that:
    - one evidence row ('e-kim-2025') is attached to two claims,
    - one claim ('c-coordination') is cited by beliefs of two different agents,
    - a cascade event row can be recorded.
    """
    conn = sqlite3.connect(":memory:")
    try:
        conn.row_factory = sqlite3.Row
        # Foreign keys are off by default in SQLite; enable them so the
        # inserts below are actually checked against the schema's references.
        conn.execute("PRAGMA foreign_keys = ON")
        conn.executescript(SCHEMA_SQL.read_text(encoding="utf-8"))

        conn.executemany(
            "INSERT INTO agents (slug, display_name, archetype) VALUES (?, ?, ?)",
            [
                ("leo", "Leo", "cross-domain synthesizer"),
                ("theseus", "Theseus", "AI alignment"),
            ],
        )
        conn.execute(
            """INSERT INTO agent_persona_revisions
                (id, agent_slug, revision, identity, voice, role, authored_by)
            VALUES
                ('persona-leo-v1', 'leo', 1, 'cross-domain synthesizer', 'direct', 'evaluate commons', 'diagram'),
                ('persona-theseus-v1', 'theseus', 1, 'alignment maze navigator', 'precise', 'AI evidence lead', 'diagram')"""
        )
        conn.execute(
            """INSERT INTO agent_strategy_revisions
                (id, agent_slug, persona_revision_id, revision, diagnosis, guiding_policy, proximate_objectives_json)
            VALUES
                ('strategy-leo-v1', 'leo', 'persona-leo-v1', 1, 'coordination is the bottleneck', 'surface cross-domain isomorphisms', '[]'),
                ('strategy-theseus-v1', 'theseus', 'persona-theseus-v1', 1, 'AI discourse is ungrounded', 'separate generation from evaluation', '[]')"""
        )
        conn.executemany(
            """INSERT INTO evidence
                (id, evidence_type, title, summary, verification_status)
            VALUES (?, ?, ?, ?, 'verified')""",
            [
                ("e-kim-2025", "study", "Kim et al. ICML 2025", "Shared evidence grounding coordination and verification degradation."),
                ("e-arrow", "formal_result", "Arrow impossibility theorem", "Formal result grounding alignment impossibility claim."),
            ],
        )
        conn.executemany(
            """INSERT INTO claims
                (id, slug, domain, description, confidence, primary_evidence_id, status)
            VALUES (?, ?, ?, ?, ?, ?, 'accepted')""",
            [
                ("c-coordination", "alignment-is-coordination", "ai-alignment", "Alignment is a coordination problem, not only a technical one.", "likely", "e-kim-2025"),
                ("c-verification", "verification-degrades-with-capability", "ai-alignment", "Verification degrades as capability gaps grow.", "experimental", "e-kim-2025"),
                ("c-arrow", "universal-alignment-impossible", "ai-alignment", "Universal alignment is mathematically impossible under strong aggregation assumptions.", "likely", "e-arrow"),
            ],
        )
        # 'e-kim-2025' is deliberately linked to two claims (shared evidence).
        conn.executemany(
            """INSERT INTO claim_evidence_edges
                (id, claim_id, evidence_id, relation, weight, rationale)
            VALUES (?, ?, ?, 'supports', ?, ?)""",
            [
                ("ce-kim-coordination", "c-coordination", "e-kim-2025", 0.9, "Diagram shared-node case: one evidence node grounds multiple claims."),
                ("ce-kim-verification", "c-verification", "e-kim-2025", 0.8, "Same evidence also grounds verification degradation."),
                ("ce-arrow", "c-arrow", "e-arrow", 0.9, "Formal result evidence."),
            ],
        )
        conn.executemany(
            """INSERT INTO agent_beliefs
                (id, agent_slug, belief_code, title, statement, falsification_criteria, is_keystone)
            VALUES (?, ?, ?, ?, ?, ?, ?)""",
            [
                ("b-leo-b1", "leo", "B1", "Coordination bottleneck", "Coordination is the bottleneck.", "Falsified by civ-scale pure-tech solution.", 1),
                ("b-theseus-t2", "theseus", "T2", "Alignment as coordination", "Alignment is a coordination problem.", "Falsified by a robust one-agent technical alignment solution.", 1),
                ("b-theseus-t4", "theseus", "T4", "Verification degradation", "Verification degrades faster than capability grows.", "Falsified by scalable oversight evidence.", 0),
            ],
        )
        # 'c-coordination' is deliberately cited by beliefs of two agents.
        conn.executemany(
            """INSERT INTO belief_claim_edges
                (id, belief_id, claim_id, relation, weight, rationale)
            VALUES (?, ?, ?, 'cites', ?, ?)""",
            [
                ("bc-leo-coordination", "b-leo-b1", "c-coordination", 1.0, "Keystone belief cites shared claim."),
                ("bc-theseus-coordination", "b-theseus-t2", "c-coordination", 0.9, "Different agent cites same shared claim."),
                ("bc-theseus-verification", "b-theseus-t4", "c-verification", 0.9, "Belief cites verification claim."),
                ("bc-theseus-arrow", "b-theseus-t2", "c-arrow", 0.6, "Belief also cites formal-result claim."),
            ],
        )
        conn.execute(
            """INSERT INTO claim_edges
                (id, from_claim_id, to_claim_id, relation, weight, rationale)
            VALUES ('edge-verification-supports-coordination', 'c-verification', 'c-coordination', 'supports', 0.6, 'Oversight degradation strengthens coordination framing.')"""
        )
        conn.execute(
            """INSERT INTO cascade_events
                (id, changed_layer, changed_id, affected_layer, affected_id, reason)
            VALUES ('cascade-kim-to-coordination', 'evidence', 'e-kim-2025', 'claim', 'c-coordination', 'shared evidence updated')"""
        )

        shared_evidence_count = conn.execute(
            "SELECT COUNT(*) AS n FROM claim_evidence_edges WHERE evidence_id = 'e-kim-2025'"
        ).fetchone()["n"]
        shared_claim_count = conn.execute(
            "SELECT COUNT(*) AS n FROM belief_claim_edges WHERE claim_id = 'c-coordination'"
        ).fetchone()["n"]
        cascade_count = conn.execute("SELECT COUNT(*) AS n FROM cascade_events").fetchone()["n"]

        assert shared_evidence_count == 2
        assert shared_claim_count == 2
        assert cascade_count == 1
    finally:
        # Close explicitly so the connection is released even when an insert
        # or assertion fails.
        conn.close()
|
||||
|
||||
|
||||
def test_claim_edges_reject_self_reference():
    """A claim edge whose source and target are the same claim must be rejected.

    Inserts one claim, then attempts to insert a claim_edges row from that
    claim to itself and expects sqlite3.IntegrityError.
    """
    conn = sqlite3.connect(":memory:")
    try:
        conn.execute("PRAGMA foreign_keys = ON")
        conn.executescript(SCHEMA_SQL.read_text(encoding="utf-8"))
        conn.execute(
            """INSERT INTO claims (id, slug, domain, description)
            VALUES ('c1', 'claim-one', 'ai-alignment', 'A claim specific enough to disagree with.')"""
        )

        try:
            conn.execute(
                """INSERT INTO claim_edges
                    (id, from_claim_id, to_claim_id, relation, rationale)
                VALUES ('self', 'c1', 'c1', 'related', 'self edge should fail')"""
            )
        except sqlite3.IntegrityError:
            # Expected: the schema refused the self-referencing edge.
            pass
        else:
            raise AssertionError("claim_edges allowed a self-reference")
    finally:
        conn.close()
|
||||
365
tests/test_research_eval_schema_sql.py
Normal file
365
tests/test_research_eval_schema_sql.py
Normal file
|
|
@ -0,0 +1,365 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
GRAPH_SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-graph-v1.sql"
|
||||
RESEARCH_EVAL_SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-research-eval-v1.sql"
|
||||
|
||||
|
||||
def _conn() -> sqlite3.Connection:
    """Return an in-memory SQLite connection with both schemas applied.

    The graph schema is executed first and the research-eval schema second.
    Rows are returned as sqlite3.Row and foreign-key enforcement is enabled.
    The caller owns the returned connection and should close it.
    """
    conn = sqlite3.connect(":memory:")
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys = ON")
        conn.executescript(GRAPH_SCHEMA_SQL.read_text(encoding="utf-8"))
        conn.executescript(RESEARCH_EVAL_SCHEMA_SQL.read_text(encoding="utf-8"))
    except Exception:
        # Do not leak the connection when a schema file is missing or invalid.
        conn.close()
        raise
    return conn
|
||||
|
||||
|
||||
def test_research_eval_schema_applies_after_graph_schema():
    """Both schema scripts apply cleanly in order and register their versions.

    Checks that graph_schema_version holds exactly the two expected version
    strings and that every research-eval table exists.
    """
    conn = _conn()
    try:
        versions = {
            row["version"]
            for row in conn.execute("SELECT version FROM graph_schema_version").fetchall()
        }
        assert versions == {
            "teleo-agent-graph-v1",
            "teleo-agent-research-eval-v1",
        }

        tables = {
            row["name"]
            for row in conn.execute(
                "SELECT name FROM sqlite_master WHERE type = 'table'"
            ).fetchall()
        }
        # Subset check: other tables (from the graph schema) are allowed.
        assert {
            "agent_research_runs",
            "agent_tool_invocations",
            "agent_research_sources",
            "agent_eval_cases",
            "agent_eval_results",
            "work_order_graph_links",
        } <= tables
    finally:
        conn.close()
|
||||
|
||||
|
||||
def test_ranger_liquidation_case_routes_to_source_backed_research_not_market_data():
    """Record a full eval trace for the Ranger case and read it back.

    Inserts an eval case, a research run, two tool invocations (one rejected
    market-data call, one executed web search), two cited sources, a graph
    evaluation run, an eval result and a work-order link. Then asserts that the
    joined run/result row reports the web-search route with two cited sources,
    and that no market-data tool was executed while exactly one was rejected.
    """
    conn = _conn()
    try:
        conn.execute(
            "INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
        )
        conn.execute(
            """INSERT INTO agent_eval_cases
                (id, suite_id, case_slug, prompt_sha256, prompt_excerpt, expected_route,
                 expected_provider, must_use_tools_json, must_not_use_tools_json, tags_json, rubric_json)
            VALUES
                (
                    'eval-ranger-liquidated-v1',
                    'leo-research-routing-v1',
                    'ranger-liquidated-not-fair-value',
                    'sha256:ranger-prompt',
                    'Is Ranger Finance fairly valued today given Ranger Finance is liquidated and gone?',
                    'web_search',
                    'agentcash-stableenrich-exa-search',
                    '["source-backed web research"]',
                    '["structured_market_data_only", "live_token_fair_value"]',
                    '["ranger_liquidated", "valuation", "source_verification"]',
                    '{"routing": "verify liquidation before valuation framing"}'
                )"""
        )
        conn.execute(
            """INSERT INTO agent_research_runs
                (id, agent_slug, source_surface, source_ref, request_kind, sponsored_work_order_id,
                 payment_receipt_id, prompt_sha256, prompt_excerpt, selected_provider, selected_route,
                 status, answer_sha256, answer_excerpt, proof_ref, cost_amount, latency_ms, source_count)
            VALUES
                (
                    'run-ranger-liquidated-001',
                    'leo',
                    'telegram',
                    'telegram:group:message-123',
                    'paid_quote',
                    'sponsored_work_orders:test-ranger-001',
                    'payment_receipts:test-ranger-001',
                    'sha256:ranger-prompt',
                    'Is Ranger Finance fairly valued today given Ranger Finance is liquidated and gone?',
                    'agentcash-stableenrich-exa-search',
                    'web_search',
                    'answered',
                    'sha256:ranger-answer',
                    'Verified liquidation/gone status before valuation framing.',
                    'proof/leo-ranger-liquidated-routing.json',
                    0.01,
                    1240,
                    3
                )"""
        )
        # Tuple order follows the column list in the INSERT below (17 values).
        conn.executemany(
            """INSERT INTO agent_tool_invocations
                (id, research_run_id, sequence, provider, tool_name, tool_category, decision,
                 decision_reason, paid, rail, network, amount, payment_receipt_id, input_sha256,
                 output_sha256, source_count, latency_ms)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            [
                (
                    "tool-ranger-market-rejected",
                    "run-ranger-liquidated-001",
                    1,
                    "DexScreener",
                    "structured-market-context",
                    "market_data",
                    "rejected",
                    "Ranger liquidation status must be verified before treating this as a live token valuation.",
                    0,
                    "free",
                    None,
                    0,
                    None,
                    "sha256:market-input",
                    None,
                    0,
                    12,
                ),
                (
                    "tool-ranger-web-selected",
                    "run-ranger-liquidated-001",
                    2,
                    "AgentCash StableEnrich",
                    "exa-search",
                    "web_search",
                    "executed",
                    "Source-backed liquidation and status verification required.",
                    1,
                    "agentcash",
                    "solana:5eykt4UsFv8P8NJdTREpY1vzqKqZKvdp",
                    0.01,
                    "payment_receipts:test-ranger-001",
                    "sha256:exa-input",
                    "sha256:exa-output",
                    3,
                    1228,
                ),
            ],
        )
        # Both sources are marked cited (cited = 1), which the final count relies on.
        conn.executemany(
            """INSERT INTO agent_research_sources
                (id, research_run_id, tool_invocation_id, source_type, source_uri_sha256,
                 title, cited, retrieval_rank, support_status)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            [
                (
                    "source-ranger-official",
                    "run-ranger-liquidated-001",
                    "tool-ranger-web-selected",
                    "web",
                    "sha256:ranger-official",
                    "Ranger status source",
                    1,
                    1,
                    "supports",
                ),
                (
                    "source-ranger-forum",
                    "run-ranger-liquidated-001",
                    "tool-ranger-web-selected",
                    "web",
                    "sha256:ranger-forum",
                    "MetaDAO/Ranger discussion source",
                    1,
                    2,
                    "context",
                ),
            ],
        )
        conn.execute(
            """INSERT INTO graph_evaluation_runs
                (id, target_layer, target_id, trigger_type, evaluator, verdict, confidence, notes)
            VALUES
                (
                    'graph-eval-ranger-routing',
                    'claim',
                    'ranger-liquidated-status',
                    'manual',
                    'leo-research-routing-benchmark',
                    'approve',
                    0.92,
                    'Tool choice matched Ranger liquidation guard.'
                )"""
        )
        conn.execute(
            """INSERT INTO agent_eval_results
                (id, eval_case_id, research_run_id, graph_evaluation_run_id, status, score,
                 routing_correct, tool_choice_score, source_quality_score, groundedness_score,
                 freshness_score, cost_efficiency_score, safety_payment_score, proof_ref)
            VALUES
                (
                    'eval-result-ranger-liquidated-001',
                    'eval-ranger-liquidated-v1',
                    'run-ranger-liquidated-001',
                    'graph-eval-ranger-routing',
                    'passed',
                    0.94,
                    1,
                    1.0,
                    0.9,
                    0.9,
                    0.85,
                    0.8,
                    1.0,
                    'proof/leo-ranger-liquidated-routing.json'
                )"""
        )
        conn.execute(
            """INSERT INTO work_order_graph_links
                (id, sponsored_work_order_id, role, graph_layer, graph_id, rationale)
            VALUES
                (
                    'wo-ranger-run-link',
                    'sponsored_work_orders:test-ranger-001',
                    'research_run',
                    'agent_research_run',
                    'run-ranger-liquidated-001',
                    'Paid work order produced source-backed research run.'
                )"""
        )

        # The cited filter lives in the LEFT JOIN's ON clause so a run with no
        # cited sources would still yield a row (with a count of 0).
        row = conn.execute(
            """SELECT
                r.selected_route,
                r.selected_provider,
                er.status AS eval_status,
                er.routing_correct,
                er.tool_choice_score,
                COUNT(s.id) AS cited_source_count
            FROM agent_research_runs r
            JOIN agent_eval_results er ON er.research_run_id = r.id
            LEFT JOIN agent_research_sources s ON s.research_run_id = r.id AND s.cited = 1
            WHERE r.id = 'run-ranger-liquidated-001'
            GROUP BY r.id, er.id"""
        ).fetchone()

        market_executed = conn.execute(
            """SELECT COUNT(*) AS n
            FROM agent_tool_invocations
            WHERE research_run_id = 'run-ranger-liquidated-001'
              AND tool_category = 'market_data'
              AND decision = 'executed'"""
        ).fetchone()["n"]
        rejected_market = conn.execute(
            """SELECT COUNT(*) AS n
            FROM agent_tool_invocations
            WHERE research_run_id = 'run-ranger-liquidated-001'
              AND tool_category = 'market_data'
              AND decision = 'rejected'"""
        ).fetchone()["n"]

        assert dict(row) == {
            "selected_route": "web_search",
            "selected_provider": "agentcash-stableenrich-exa-search",
            "eval_status": "passed",
            "routing_correct": 1,
            "tool_choice_score": 1.0,
            "cited_source_count": 2,
        }
        assert market_executed == 0
        assert rejected_market == 1
    finally:
        conn.close()
|
||||
|
||||
|
||||
def test_schema_rejects_secret_flags_bad_scores_and_bad_tool_decisions():
    """Each invalid INSERT below must raise sqlite3.IntegrityError.

    A valid agent, research run and eval case are inserted first so that the
    invalid statements reference existing parent rows. The invalid statements
    cover: a secret flag set to 1 on a research run, a secret flag set to 1 on
    a work-order link, an unknown tool decision value, a score of 1.1, and a
    routing_correct value of 2.
    """
    conn = _conn()
    try:
        conn.execute(
            "INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
        )
        conn.execute(
            """INSERT INTO agent_research_runs
                (id, agent_slug, source_surface, request_kind, prompt_sha256, selected_route, status)
            VALUES ('run-constraints', 'leo', 'test', 'benchmark', 'sha256:prompt', 'web_search', 'answered')"""
        )
        conn.execute(
            """INSERT INTO agent_eval_cases
                (id, suite_id, case_slug, prompt_sha256, prompt_excerpt, expected_route)
            VALUES ('case-constraints', 'suite', 'case', 'sha256:prompt', 'redacted prompt', 'web_search')"""
        )

        invalid_statements = [
            """INSERT INTO agent_research_runs
                (id, agent_slug, source_surface, request_kind, prompt_sha256, selected_route, status, secret_values_included)
            VALUES ('run-secret', 'leo', 'test', 'benchmark', 'sha256:secret', 'web_search', 'answered', 1)""",
            # work_order_graph_links only admits secret_values_included = 0;
            # role and graph_layer use values its CHECK lists allow.
            """INSERT INTO work_order_graph_links
                (id, sponsored_work_order_id, role, graph_layer, graph_id, secret_values_included)
            VALUES ('link-secret', 'sponsored_work_orders:test-constraints', 'research_run', 'agent_research_run', 'run-constraints', 1)""",
            """INSERT INTO agent_tool_invocations
                (id, research_run_id, provider, tool_name, tool_category, decision, decision_reason)
            VALUES ('tool-bad-decision', 'run-constraints', 'p', 't', 'web_search', 'approved', 'bad enum')""",
            """INSERT INTO agent_eval_results
                (id, eval_case_id, research_run_id, status, score)
            VALUES ('eval-bad-score', 'case-constraints', 'run-constraints', 'passed', 1.1)""",
            """INSERT INTO agent_eval_results
                (id, eval_case_id, research_run_id, status, routing_correct)
            VALUES ('eval-bad-bool', 'case-constraints', 'run-constraints', 'passed', 2)""",
        ]

        for statement in invalid_statements:
            try:
                conn.execute(statement)
            except sqlite3.IntegrityError:
                # Expected: the schema constraint rejected the row.
                pass
            else:
                raise AssertionError(f"invalid statement unexpectedly passed: {statement}")
    finally:
        conn.close()
|
||||
|
||||
|
||||
def test_research_run_can_be_recorded_without_raw_prompt_or_private_payloads():
    """A run and tool invocation can be stored with hashes only.

    Inserts a research run and one tool invocation without supplying
    prompt_excerpt or answer_excerpt, then asserts that both excerpts read back
    as NULL and that secret_values_included defaults to 0 on both rows.
    """
    conn = _conn()
    try:
        conn.execute(
            "INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
        )
        conn.execute(
            """INSERT INTO agent_research_runs
                (id, agent_slug, source_surface, source_ref, request_kind, prompt_sha256,
                 selected_route, status, answer_sha256, proof_ref)
            VALUES
                (
                    'run-hash-only',
                    'leo',
                    'api',
                    'api:request-redacted',
                    'paid_work_order',
                    'sha256:prompt-only',
                    'social_trends',
                    'answered',
                    'sha256:answer-only',
                    'proof/hash-only.json'
                )"""
        )
        conn.execute(
            """INSERT INTO agent_tool_invocations
                (id, research_run_id, provider, tool_name, tool_category, decision,
                 decision_reason, input_sha256, output_sha256)
            VALUES
                (
                    'tool-hash-only',
                    'run-hash-only',
                    'AgentCash StableSocial',
                    'lightreel-trends',
                    'social_trends',
                    'executed',
                    'Question asks for current Twitter/X discussion.',
                    'sha256:input-only',
                    'sha256:output-only'
                )"""
        )

        row = conn.execute(
            """SELECT
                r.prompt_excerpt,
                r.answer_excerpt,
                r.secret_values_included AS run_secret_flag,
                t.secret_values_included AS tool_secret_flag
            FROM agent_research_runs r
            JOIN agent_tool_invocations t ON t.research_run_id = r.id
            WHERE r.id = 'run-hash-only'"""
        ).fetchone()

        assert row["prompt_excerpt"] is None
        assert row["answer_excerpt"] is None
        assert row["run_secret_flag"] == 0
        assert row["tool_secret_flag"] == 0
    finally:
        conn.close()
|
||||
Loading…
Reference in a new issue