From 1a71efcde2967693b6257e38ce1af89b941fabe1 Mon Sep 17 00:00:00 2001 From: twentyOne2x Date: Wed, 24 Jun 2026 14:21:03 +0200 Subject: [PATCH] Add Teleo research eval schema Adds graph schema prerequisite plus research-eval schema/docs/tests for Leo tool-use benchmarks and x402 research telemetry. Validated by full local pytest and green CI. --- schemas/teleo-agent-graph-v1.md | 104 +++++++ schemas/teleo-agent-graph-v1.sql | 251 ++++++++++++++++ schemas/teleo-agent-research-eval-v1.md | 73 +++++ schemas/teleo-agent-research-eval-v1.sql | 247 +++++++++++++++ tests/test_agent_graph_schema_sql.py | 129 ++++++++ tests/test_research_eval_schema_sql.py | 365 +++++++++++++++++++++++ 6 files changed, 1169 insertions(+) create mode 100644 schemas/teleo-agent-graph-v1.md create mode 100644 schemas/teleo-agent-graph-v1.sql create mode 100644 schemas/teleo-agent-research-eval-v1.md create mode 100644 schemas/teleo-agent-research-eval-v1.sql create mode 100644 tests/test_agent_graph_schema_sql.py create mode 100644 tests/test_research_eval_schema_sql.py diff --git a/schemas/teleo-agent-graph-v1.md b/schemas/teleo-agent-graph-v1.md new file mode 100644 index 0000000..73fc067 --- /dev/null +++ b/schemas/teleo-agent-graph-v1.md @@ -0,0 +1,104 @@ +# Teleo Agent Graph Schema v1 + +Source idea: `teleo-agent-architecture-COMBINED (2).excalidraw`. + +This schema models the agent commons as a graph: + +```text +persona -> strategy -> position -> belief -> claim -> evidence +``` + +The top layers are agent-owned. The lower layers are shared commons. +Changes cascade upward: evidence changes re-evaluate claims, claims flag beliefs, +beliefs flag positions, and positions can force persona/strategy review. + +## Design Commitments + +- Personas are authored, stable, and loaded every turn. +- Strategies are derived from personas using the Rumelt kernel: + diagnosis, guiding policy, proximate objectives. +- Positions and beliefs are per-agent public commitments. 
+- Claims are owned by no agent. +- Evidence is owned by no agent. +- Claims link to claims through typed weighted edges. +- One evidence node can ground many claims. +- One claim can be cited by many beliefs across agents and domains. +- `cited_by` and `importance` are computed/readback fields, not hand-authored + truth. +- Every edge has a relation, weight, and rationale so cascade behavior is + auditable. + +## Main Tables + +| Table | Purpose | +| --- | --- | +| `agents` | Agent registry: Leo, Rio, Theseus, etc. | +| `agent_persona_revisions` | Stable authored identity, voice, and role snapshots | +| `agent_strategy_revisions` | Derived diagnosis, guiding policy, and objectives | +| `agent_positions` | Per-agent public commitments with falsification criteria | +| `agent_beliefs` | Per-agent falsifiable beliefs citing claims | +| `claims` | Shared claim commons | +| `evidence` | Shared sourced/verifiable evidence commons | +| `position_belief_edges` | Position depends on belief | +| `belief_claim_edges` | Belief cites or depends on claim | +| `claim_edges` | Claim-to-claim typed relationship | +| `claim_evidence_edges` | Claim grounded by evidence | +| `graph_evaluation_runs` | Evaluation/re-evaluation records | +| `cascade_events` | Upward propagation queue/history | +| `graph_history_events` | Sanitized GitHub/Forgejo/local-git manifest events | +| `graph_node_history_links` | Links history events to graph nodes | + +## Claim Node + +Diagram frontmatter maps to `claims`: + +| Diagram field | Column | +| --- | --- | +| `type: claim` | implicit table | +| `domain` | `claims.domain` | +| `description` | `claims.description` | +| `confidence` | `claims.confidence` | +| `source` | `claims.source_summary`, plus evidence edges | +| `created` | `claims.created_at` | +| `last_evaluated` | `claims.last_evaluated` | +| `cross_references` | `claim_edges` | +| `importance` | `claims.importance`, computed from inbound refs | +| `attribution` | `claims.attribution_json` | + 
+## Claim Relations + +| Relation | Meaning | +| --- | --- | +| `depends_on` | This claim cannot be true unless the linked claim is true | +| `supports` | Linked claim provides evidence for this one | +| `challenged_by` | Linked claim is counter-argument or counter-evidence | +| `cited_by` | Computed inbound reference, not hand-authored | +| `related` | Topical link without a specific evidential relationship | + +## Experiment Use + +This schema should be applied after a test database is created and before a +history manifest is loaded: + +```text +spin up database +apply teleo-agent-graph-v1.sql +load history manifest through graph adapter +run persona/journey/red-team experiments +verify cascades and graph invariants +tear database down +``` + +## Minimum Invariants + +- Every active belief must cite at least `min_claims` claims (default 3) before it can be marked + `load_bearing`. +- Every active claim must have at least one evidence edge before it can be + marked `accepted`. +- Red-team or quarantined claims cannot be cited by active beliefs unless the + edge relation is `challenged_by`. +- `claim_edges` cannot self-reference (enforced in the DDL by a table-level `CHECK`). +- `importance` should be recomputed from inbound belief and claim references + during loader/evaluation jobs. +- Any evidence update must produce cascade events for affected claims and + upstream beliefs/positions. diff --git a/schemas/teleo-agent-graph-v1.sql b/schemas/teleo-agent-graph-v1.sql new file mode 100644 index 0000000..66c2c1c --- /dev/null +++ b/schemas/teleo-agent-graph-v1.sql @@ -0,0 +1,251 @@ +-- Teleo Agent Graph Schema v1 +-- Common SQL subset intended for ephemeral SQLite tests and Postgres/Supabase +-- staging. IDs are app-generated text IDs so this can run across engines. 
+ +CREATE TABLE IF NOT EXISTS graph_schema_version ( + version TEXT PRIMARY KEY, + source TEXT NOT NULL, + applied_at TEXT DEFAULT CURRENT_TIMESTAMP +); + +INSERT INTO graph_schema_version (version, source) -- ON CONFLICT is accepted by SQLite (3.24+) and Postgres; INSERT OR IGNORE is SQLite-only +VALUES ('teleo-agent-graph-v1', 'teleo-agent-architecture-excalidraw') ON CONFLICT (version) DO NOTHING; + +CREATE TABLE IF NOT EXISTS agents ( + slug TEXT PRIMARY KEY, + display_name TEXT NOT NULL, + archetype TEXT, + status TEXT NOT NULL DEFAULT 'active' + CHECK(status IN ('active', 'inactive', 'deprecated')), + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + updated_at TEXT DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS agent_persona_revisions ( + id TEXT PRIMARY KEY, + agent_slug TEXT NOT NULL REFERENCES agents(slug), + revision INTEGER NOT NULL, + identity TEXT NOT NULL, + voice TEXT NOT NULL, + role TEXT NOT NULL, + authored_by TEXT, + stable INTEGER NOT NULL DEFAULT 1 CHECK(stable IN (0, 1)), + loads_every_turn INTEGER NOT NULL DEFAULT 1 CHECK(loads_every_turn IN (0, 1)), + active INTEGER NOT NULL DEFAULT 1 CHECK(active IN (0, 1)), + notes TEXT, + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + UNIQUE(agent_slug, revision) +); + +CREATE TABLE IF NOT EXISTS agent_strategy_revisions ( + id TEXT PRIMARY KEY, + agent_slug TEXT NOT NULL REFERENCES agents(slug), + persona_revision_id TEXT REFERENCES agent_persona_revisions(id), + revision INTEGER NOT NULL, + diagnosis TEXT NOT NULL, + guiding_policy TEXT NOT NULL, + proximate_objectives_json TEXT NOT NULL DEFAULT '[]', + derivation_notes TEXT, + active INTEGER NOT NULL DEFAULT 1 CHECK(active IN (0, 1)), + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + UNIQUE(agent_slug, revision) +); + +CREATE TABLE IF NOT EXISTS agent_positions ( + id TEXT PRIMARY KEY, + agent_slug TEXT NOT NULL REFERENCES agents(slug), + title TEXT NOT NULL, + statement TEXT NOT NULL, + falsification_criteria TEXT, + public_commitment INTEGER NOT NULL DEFAULT 1 CHECK(public_commitment IN (0, 1)), + confidence TEXT NOT NULL DEFAULT 'experimental' + 
CHECK(confidence IN ('proven', 'likely', 'experimental', 'speculative')), + status TEXT NOT NULL DEFAULT 'active' + CHECK(status IN ('draft', 'active', 'flagged', 'retired')), + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + last_reviewed TEXT +); + +CREATE TABLE IF NOT EXISTS agent_beliefs ( + id TEXT PRIMARY KEY, + agent_slug TEXT NOT NULL REFERENCES agents(slug), + belief_code TEXT NOT NULL, + title TEXT NOT NULL, + statement TEXT NOT NULL, + falsification_criteria TEXT, + is_keystone INTEGER NOT NULL DEFAULT 0 CHECK(is_keystone IN (0, 1)), + min_claims INTEGER NOT NULL DEFAULT 3, + confidence TEXT NOT NULL DEFAULT 'experimental' + CHECK(confidence IN ('proven', 'likely', 'experimental', 'speculative')), + status TEXT NOT NULL DEFAULT 'active' + CHECK(status IN ('draft', 'active', 'load_bearing', 'flagged', 'retired')), + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + last_evaluated TEXT, + UNIQUE(agent_slug, belief_code) +); + +CREATE TABLE IF NOT EXISTS evidence ( + id TEXT PRIMARY KEY, + evidence_type TEXT NOT NULL + CHECK(evidence_type IN ('study', 'data', 'event', 'formal_result', 'legal', 'market', 'historical', 'other')), + title TEXT NOT NULL, + source_uri TEXT, + citation TEXT, + summary TEXT NOT NULL, + verification_status TEXT NOT NULL DEFAULT 'unverified' + CHECK(verification_status IN ('unverified', 'sourced', 'verified', 'disputed', 'retracted')), + observed_at TEXT, + attribution_json TEXT NOT NULL DEFAULT '{}', + created_at TEXT DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS claims ( + id TEXT PRIMARY KEY, + slug TEXT NOT NULL UNIQUE, + domain TEXT NOT NULL, + description TEXT NOT NULL, + confidence TEXT NOT NULL DEFAULT 'experimental' + CHECK(confidence IN ('proven', 'likely', 'experimental', 'speculative')), + source_summary TEXT, + proposed_by TEXT, + primary_evidence_id TEXT REFERENCES evidence(id), + importance REAL NOT NULL DEFAULT 0 CHECK(importance >= 0 AND importance <= 1), + status TEXT NOT NULL DEFAULT 'draft' + CHECK(status 
IN ('draft', 'active', 'accepted', 'challenged', 'quarantined', 'retired')), + attribution_json TEXT NOT NULL DEFAULT '{}', + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + last_evaluated TEXT +); + +CREATE TABLE IF NOT EXISTS position_belief_edges ( + id TEXT PRIMARY KEY, + position_id TEXT NOT NULL REFERENCES agent_positions(id), + belief_id TEXT NOT NULL REFERENCES agent_beliefs(id), + relation TEXT NOT NULL DEFAULT 'depends_on' + CHECK(relation IN ('depends_on', 'supports', 'challenged_by', 'related')), + weight REAL NOT NULL DEFAULT 1 CHECK(weight >= 0 AND weight <= 1), + rationale TEXT NOT NULL, + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + UNIQUE(position_id, belief_id, relation) +); + +CREATE TABLE IF NOT EXISTS belief_claim_edges ( + id TEXT PRIMARY KEY, + belief_id TEXT NOT NULL REFERENCES agent_beliefs(id), + claim_id TEXT NOT NULL REFERENCES claims(id), + relation TEXT NOT NULL DEFAULT 'cites' + CHECK(relation IN ('cites', 'depends_on', 'supports', 'challenged_by', 'related')), + weight REAL NOT NULL DEFAULT 1 CHECK(weight >= 0 AND weight <= 1), + rationale TEXT NOT NULL, + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + UNIQUE(belief_id, claim_id, relation) +); + +CREATE TABLE IF NOT EXISTS claim_edges ( + id TEXT PRIMARY KEY, + from_claim_id TEXT NOT NULL REFERENCES claims(id), + to_claim_id TEXT NOT NULL REFERENCES claims(id), + relation TEXT NOT NULL + CHECK(relation IN ('depends_on', 'supports', 'challenged_by', 'cited_by', 'related')), + weight REAL NOT NULL DEFAULT 1 CHECK(weight >= 0 AND weight <= 1), + rationale TEXT NOT NULL, + authored_by TEXT, + computed INTEGER NOT NULL DEFAULT 0 CHECK(computed IN (0, 1)), + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + CHECK(from_claim_id <> to_claim_id), + UNIQUE(from_claim_id, to_claim_id, relation) +); + +CREATE TABLE IF NOT EXISTS claim_evidence_edges ( + id TEXT PRIMARY KEY, + claim_id TEXT NOT NULL REFERENCES claims(id), + evidence_id TEXT NOT NULL REFERENCES evidence(id), + relation TEXT NOT NULL DEFAULT 
'supports' + CHECK(relation IN ('primary', 'supports', 'challenges', 'context', 'weakens')), + weight REAL NOT NULL DEFAULT 1 CHECK(weight >= 0 AND weight <= 1), + rationale TEXT NOT NULL, + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + UNIQUE(claim_id, evidence_id, relation) +); + +CREATE TABLE IF NOT EXISTS graph_evaluation_runs ( + id TEXT PRIMARY KEY, + target_layer TEXT NOT NULL + CHECK(target_layer IN ('persona', 'strategy', 'position', 'belief', 'claim', 'evidence', 'edge')), + target_id TEXT NOT NULL, + trigger_type TEXT NOT NULL + CHECK(trigger_type IN ('scheduled', 'history_replay', 'evidence_changed', 'claim_changed', 'manual', 'red_team')), + trigger_id TEXT, + evaluator TEXT NOT NULL, + model TEXT, + verdict TEXT NOT NULL + CHECK(verdict IN ('approve', 'request_changes', 'reject', 'flag', 'quarantine', 'no_op')), + confidence REAL CHECK(confidence IS NULL OR (confidence >= 0 AND confidence <= 1)), + notes TEXT, + created_at TEXT DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS cascade_events ( + id TEXT PRIMARY KEY, + changed_layer TEXT NOT NULL + CHECK(changed_layer IN ('evidence', 'claim', 'belief', 'position', 'strategy', 'persona')), + changed_id TEXT NOT NULL, + affected_layer TEXT NOT NULL + CHECK(affected_layer IN ('claim', 'belief', 'position', 'strategy', 'persona')), + affected_id TEXT NOT NULL, + direction TEXT NOT NULL DEFAULT 'up' + CHECK(direction IN ('up', 'down', 'lateral')), + status TEXT NOT NULL DEFAULT 'queued' + CHECK(status IN ('queued', 'reviewing', 'resolved', 'ignored')), + reason TEXT NOT NULL, + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + resolved_at TEXT +); + +CREATE TABLE IF NOT EXISTS graph_history_events ( + id TEXT PRIMARY KEY, + provider TEXT NOT NULL CHECK(provider IN ('github', 'forgejo', 'local_git', 'web', 'x', 'telegram', 'manual')), + repo TEXT, + provider_event_id TEXT, + event_type TEXT NOT NULL, + actor TEXT, + occurred_at TEXT, + payload_json TEXT NOT NULL DEFAULT '{}', + redacted INTEGER NOT NULL 
DEFAULT 1 CHECK(redacted IN (0, 1)), + created_at TEXT DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS graph_node_history_links ( + history_event_id TEXT NOT NULL REFERENCES graph_history_events(id), + node_layer TEXT NOT NULL + CHECK(node_layer IN ('persona', 'strategy', 'position', 'belief', 'claim', 'evidence', 'edge')), + node_id TEXT NOT NULL, + role TEXT NOT NULL + CHECK(role IN ('created', 'updated', 'evaluated', 'merged', 'challenged', 'cited', 'sourced')), + PRIMARY KEY (history_event_id, node_layer, node_id, role) +); + +CREATE INDEX IF NOT EXISTS idx_persona_revisions_agent_active + ON agent_persona_revisions(agent_slug, active); +CREATE INDEX IF NOT EXISTS idx_strategy_revisions_agent_active + ON agent_strategy_revisions(agent_slug, active); +CREATE INDEX IF NOT EXISTS idx_positions_agent_status + ON agent_positions(agent_slug, status); +CREATE INDEX IF NOT EXISTS idx_beliefs_agent_status + ON agent_beliefs(agent_slug, status); +CREATE INDEX IF NOT EXISTS idx_claims_domain_status + ON claims(domain, status); +CREATE INDEX IF NOT EXISTS idx_claims_importance + ON claims(importance); +CREATE INDEX IF NOT EXISTS idx_evidence_status + ON evidence(verification_status); +CREATE INDEX IF NOT EXISTS idx_belief_claim_edges_claim + ON belief_claim_edges(claim_id, relation); +CREATE INDEX IF NOT EXISTS idx_claim_edges_to + ON claim_edges(to_claim_id, relation); +CREATE INDEX IF NOT EXISTS idx_claim_evidence_edges_evidence + ON claim_evidence_edges(evidence_id, relation); +CREATE INDEX IF NOT EXISTS idx_cascade_status + ON cascade_events(status, affected_layer); +CREATE INDEX IF NOT EXISTS idx_history_provider_repo + ON graph_history_events(provider, repo, event_type); diff --git a/schemas/teleo-agent-research-eval-v1.md b/schemas/teleo-agent-research-eval-v1.md new file mode 100644 index 0000000..f5a2cbc --- /dev/null +++ b/schemas/teleo-agent-research-eval-v1.md @@ -0,0 +1,73 @@ +# Teleo Agent Research Eval Schema v1 + +Apply this schema after 
`teleo-agent-graph-v1.sql`. + +This schema records how Leo and other agents answer research requests, which +tools they choose, what sources they cite, and whether benchmark cases passed. +It is operational/economic telemetry, not the claim/evidence graph itself. + +## Design Commitments + +- The graph schema remains the knowledge spine: persona, strategy, beliefs, + claims, evidence, graph evals, and cascades. +- Research-eval rows explain how a request was handled and whether the route was + good enough to trust or ship. +- Payment funds work. It does not directly mutate claims, confidence, beliefs, + or rewards. +- Tool-use benchmarking must distinguish candidates, selected tools, executed + tools, skipped tools, and rejected tools. +- Secrets and private payloads are never stored. Tables store hashes, redacted + excerpts, proof references, source metadata, and receipt ids. + +## Main Tables + +| Table | Purpose | +| --- | --- | +| `agent_research_runs` | One row per research request from Telegram, API, checkout, CLI, or benchmark. | +| `agent_tool_invocations` | One row per candidate, selected, executed, skipped, rejected, fallback, or failed tool decision. | +| `agent_research_sources` | Retrieved or cited source rows tied to a run and optionally a tool invocation. | +| `agent_eval_cases` | Versioned benchmark prompts, expected routes/providers, tool constraints, tags, and rubrics. | +| `agent_eval_results` | Per-case result, routing correctness, tool score, source quality, groundedness, cost, and safety scores. | +| `work_order_graph_links` | Links sponsored work orders to research runs, tool traces, graph evals, evidence, claims, and outcomes. 
| + +## Leo x402 Research Flow + +```text +Telegram/API question +-> agent_research_runs +-> agent_tool_invocations +-> agent_research_sources +-> agent_eval_results when a benchmark case applies +-> work_order_graph_links when a paid work order or graph artifact is involved +``` + +For paid research, `agent_research_runs.sponsored_work_order_id` and +`payment_receipt_id` carry the external work-order/payment anchors. The payment +receipt table is still owned by the economic/payment layer; this schema only +keeps references. + +## Ranger Liquidation Guard + +The Ranger benchmark class should be represented as: + +- `agent_eval_cases.expected_route = 'web_search'` +- `agent_eval_cases.tags_json` includes `ranger_liquidated` +- `agent_eval_cases.must_not_use_tools_json` includes market-data-only routes +- `agent_tool_invocations` records market data as `rejected` or `skipped` when + it is not the right tool +- `agent_eval_results.routing_correct = 1` only if Leo routed to source-backed + research instead of live-token valuation + +This ensures "Ranger is liquidated/gone" is verified before any valuation +framing and never silently treated as a normal live fair-value token question. + +## Minimum Invariants + +- No row may set `secret_values_included = 1`. +- A benchmark result must link to both an eval case and a research run. +- Tool invocation sequence numbers are unique per research run. +- Scores are bounded between `0` and `1`. +- Research runs store prompt and answer hashes plus optional redacted excerpts, + not raw private prompts. +- `outcome_observations` remain the downstream business-value layer; raw tool + traces belong here, not there. 
diff --git a/schemas/teleo-agent-research-eval-v1.sql b/schemas/teleo-agent-research-eval-v1.sql new file mode 100644 index 0000000..1b9be78 --- /dev/null +++ b/schemas/teleo-agent-research-eval-v1.sql @@ -0,0 +1,247 @@ +-- Teleo Agent Research Eval Schema v1 +-- Common SQL subset intended for ephemeral SQLite tests and Postgres/Supabase +-- staging. IDs are app-generated text IDs so this can run across engines. +-- +-- Apply after teleo-agent-graph-v1.sql. +-- +-- Secret policy: store hashes, redacted excerpts, and proof references only. +-- Raw prompts, bearer tokens, API keys, wallet secrets, and private receipts do +-- not belong in these tables. + +INSERT INTO graph_schema_version (version, source) -- ON CONFLICT is accepted by SQLite (3.24+) and Postgres; INSERT OR IGNORE is SQLite-only +VALUES ('teleo-agent-research-eval-v1', 'leo-x402-research-routing-benchmark') ON CONFLICT (version) DO NOTHING; + +CREATE TABLE IF NOT EXISTS agent_research_runs ( + id TEXT PRIMARY KEY, + agent_slug TEXT NOT NULL REFERENCES agents(slug), + source_surface TEXT NOT NULL + CHECK(source_surface IN ('telegram', 'api', 'checkout', 'web', 'cli', 'test', 'other')), + source_ref TEXT, + request_kind TEXT NOT NULL DEFAULT 'free' + CHECK(request_kind IN ('free', 'paid_quote', 'paid_work_order', 'benchmark', 'system')), + sponsored_work_order_id TEXT, + payment_receipt_id TEXT, + prompt_sha256 TEXT NOT NULL, + prompt_excerpt TEXT, + selected_provider TEXT, + selected_route TEXT NOT NULL DEFAULT 'unknown' + CHECK(selected_route IN ( + 'none', + 'web_search', + 'social_trends', + 'structured_market_data', + 'local_context', + 'mixed', + 'unknown' + )), + status TEXT NOT NULL DEFAULT 'running' + CHECK(status IN ( + 'quoted', + 'payment_pending', + 'running', + 'answered', + 'abstained', + 'blocked', + 'failed', + 'cancelled' + )), + answer_sha256 TEXT, + answer_excerpt TEXT, + proof_ref TEXT, + cost_amount REAL NOT NULL DEFAULT 0 CHECK(cost_amount >= 0), + currency TEXT NOT NULL DEFAULT 'USDC', + latency_ms INTEGER CHECK(latency_ms IS NULL OR latency_ms >= 0), + source_count INTEGER NOT NULL DEFAULT 
0 CHECK(source_count >= 0), + secret_values_included INTEGER NOT NULL DEFAULT 0 CHECK(secret_values_included = 0), + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + completed_at TEXT, + CHECK(prompt_excerpt IS NULL OR length(prompt_excerpt) <= 1000), + CHECK(answer_excerpt IS NULL OR length(answer_excerpt) <= 2000) +); + +CREATE INDEX IF NOT EXISTS idx_agent_research_runs_agent_created + ON agent_research_runs(agent_slug, created_at); +CREATE INDEX IF NOT EXISTS idx_agent_research_runs_work_order + ON agent_research_runs(sponsored_work_order_id); +CREATE INDEX IF NOT EXISTS idx_agent_research_runs_status_route + ON agent_research_runs(status, selected_route); + +CREATE TABLE IF NOT EXISTS agent_tool_invocations ( + id TEXT PRIMARY KEY, + research_run_id TEXT NOT NULL REFERENCES agent_research_runs(id) ON DELETE CASCADE, + sequence INTEGER NOT NULL DEFAULT 0 CHECK(sequence >= 0), + provider TEXT NOT NULL, + tool_name TEXT NOT NULL, + tool_category TEXT NOT NULL + CHECK(tool_category IN ( + 'web_search', + 'social_trends', + 'market_data', + 'page_read', + 'x402_checkout', + 'agentcash', + 'faremeter', + 'database', + 'local_context', + 'other' + )), + endpoint_host TEXT, + endpoint_hash TEXT, + decision TEXT NOT NULL + CHECK(decision IN ('candidate', 'selected', 'executed', 'skipped', 'rejected', 'fallback', 'failed')), + decision_reason TEXT NOT NULL, + paid INTEGER NOT NULL DEFAULT 0 CHECK(paid IN (0, 1)), + rail TEXT CHECK(rail IS NULL OR rail IN ('x402', 'agentcash', 'manual', 'free', 'other')), + network TEXT, + amount REAL CHECK(amount IS NULL OR amount >= 0), + currency TEXT NOT NULL DEFAULT 'USDC', + payment_receipt_id TEXT, + input_sha256 TEXT, + output_sha256 TEXT, + source_count INTEGER NOT NULL DEFAULT 0 CHECK(source_count >= 0), + latency_ms INTEGER CHECK(latency_ms IS NULL OR latency_ms >= 0), + error_class TEXT, + secret_values_included INTEGER NOT NULL DEFAULT 0 CHECK(secret_values_included = 0), + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + 
UNIQUE(research_run_id, sequence) +); + +CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_run_decision + ON agent_tool_invocations(research_run_id, decision); +CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_provider_category + ON agent_tool_invocations(provider, tool_category); +CREATE INDEX IF NOT EXISTS idx_agent_tool_invocations_receipt + ON agent_tool_invocations(payment_receipt_id); + +CREATE TABLE IF NOT EXISTS agent_research_sources ( + id TEXT PRIMARY KEY, + research_run_id TEXT NOT NULL REFERENCES agent_research_runs(id) ON DELETE CASCADE, + tool_invocation_id TEXT REFERENCES agent_tool_invocations(id) ON DELETE SET NULL, + source_type TEXT NOT NULL + CHECK(source_type IN ('web', 'social', 'market', 'db', 'document', 'other')), + source_uri TEXT, + source_uri_sha256 TEXT, + title TEXT, + cited INTEGER NOT NULL DEFAULT 0 CHECK(cited IN (0, 1)), + retrieval_rank INTEGER CHECK(retrieval_rank IS NULL OR retrieval_rank >= 0), + observed_at TEXT, + support_status TEXT NOT NULL DEFAULT 'unknown' + CHECK(support_status IN ('supports', 'context', 'conflicts', 'stale', 'unknown')), + secret_values_included INTEGER NOT NULL DEFAULT 0 CHECK(secret_values_included = 0), + created_at TEXT DEFAULT CURRENT_TIMESTAMP +); + +CREATE INDEX IF NOT EXISTS idx_agent_research_sources_run + ON agent_research_sources(research_run_id, cited); +CREATE INDEX IF NOT EXISTS idx_agent_research_sources_tool + ON agent_research_sources(tool_invocation_id); + +CREATE TABLE IF NOT EXISTS agent_eval_cases ( + id TEXT PRIMARY KEY, + suite_id TEXT NOT NULL, + case_slug TEXT NOT NULL, + case_version INTEGER NOT NULL DEFAULT 1 CHECK(case_version >= 1), + prompt_sha256 TEXT NOT NULL, + prompt_excerpt TEXT NOT NULL CHECK(length(prompt_excerpt) <= 1000), + fixture_context_sha256 TEXT, + fixture_context_excerpt TEXT CHECK(fixture_context_excerpt IS NULL OR length(fixture_context_excerpt) <= 2000), + expected_route TEXT NOT NULL + CHECK(expected_route IN ( + 'none', + 'web_search', + 
'social_trends', + 'structured_market_data', + 'local_context', + 'mixed', + 'unknown' + )), + expected_provider TEXT, + must_use_tools_json TEXT NOT NULL DEFAULT '[]', + must_not_use_tools_json TEXT NOT NULL DEFAULT '[]', + tags_json TEXT NOT NULL DEFAULT '[]', + rubric_json TEXT NOT NULL DEFAULT '{}', + stale_after TEXT, + active INTEGER NOT NULL DEFAULT 1 CHECK(active IN (0, 1)), + secret_values_included INTEGER NOT NULL DEFAULT 0 CHECK(secret_values_included = 0), + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + UNIQUE(suite_id, case_slug, case_version) +); + +CREATE INDEX IF NOT EXISTS idx_agent_eval_cases_suite_active + ON agent_eval_cases(suite_id, active); +CREATE INDEX IF NOT EXISTS idx_agent_eval_cases_route + ON agent_eval_cases(expected_route); + +CREATE TABLE IF NOT EXISTS agent_eval_results ( + id TEXT PRIMARY KEY, + eval_case_id TEXT NOT NULL REFERENCES agent_eval_cases(id) ON DELETE CASCADE, + research_run_id TEXT NOT NULL REFERENCES agent_research_runs(id) ON DELETE CASCADE, + graph_evaluation_run_id TEXT REFERENCES graph_evaluation_runs(id) ON DELETE SET NULL, + status TEXT NOT NULL + CHECK(status IN ('passed', 'failed', 'warning', 'blocked', 'skipped')), + score REAL CHECK(score IS NULL OR (score >= 0 AND score <= 1)), + routing_correct INTEGER CHECK(routing_correct IS NULL OR routing_correct IN (0, 1)), + tool_choice_score REAL CHECK(tool_choice_score IS NULL OR (tool_choice_score >= 0 AND tool_choice_score <= 1)), + source_quality_score REAL CHECK(source_quality_score IS NULL OR (source_quality_score >= 0 AND source_quality_score <= 1)), + groundedness_score REAL CHECK(groundedness_score IS NULL OR (groundedness_score >= 0 AND groundedness_score <= 1)), + freshness_score REAL CHECK(freshness_score IS NULL OR (freshness_score >= 0 AND freshness_score <= 1)), + cost_efficiency_score REAL CHECK(cost_efficiency_score IS NULL OR (cost_efficiency_score >= 0 AND cost_efficiency_score <= 1)), + safety_payment_score REAL CHECK(safety_payment_score IS NULL 
OR (safety_payment_score >= 0 AND safety_payment_score <= 1)), + failure_reason TEXT, + judge TEXT, + proof_ref TEXT, + secret_values_included INTEGER NOT NULL DEFAULT 0 CHECK(secret_values_included = 0), + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + UNIQUE(eval_case_id, research_run_id) +); + +CREATE INDEX IF NOT EXISTS idx_agent_eval_results_case_status + ON agent_eval_results(eval_case_id, status); +CREATE INDEX IF NOT EXISTS idx_agent_eval_results_run + ON agent_eval_results(research_run_id); +CREATE INDEX IF NOT EXISTS idx_agent_eval_results_graph_eval + ON agent_eval_results(graph_evaluation_run_id); + +CREATE TABLE IF NOT EXISTS work_order_graph_links ( + id TEXT PRIMARY KEY, + sponsored_work_order_id TEXT NOT NULL, + role TEXT NOT NULL + CHECK(role IN ( + 'input_context', + 'evaluation_target', + 'created_evidence', + 'created_claim', + 'created_eval_run', + 'research_run', + 'tool_trace', + 'history_trace', + 'outcome_trace' + )), + graph_layer TEXT NOT NULL + CHECK(graph_layer IN ( + 'persona', + 'strategy', + 'position', + 'belief', + 'claim', + 'evidence', + 'edge', + 'graph_evaluation_run', + 'cascade_event', + 'graph_history_event', + 'agent_research_run', + 'agent_tool_invocation', + 'agent_eval_result', + 'outcome_observation' + )), + graph_id TEXT NOT NULL, + rationale TEXT, + secret_values_included INTEGER NOT NULL DEFAULT 0 CHECK(secret_values_included = 0), + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + UNIQUE(sponsored_work_order_id, role, graph_layer, graph_id) +); + +CREATE INDEX IF NOT EXISTS idx_work_order_graph_links_work_order + ON work_order_graph_links(sponsored_work_order_id); +CREATE INDEX IF NOT EXISTS idx_work_order_graph_links_graph + ON work_order_graph_links(graph_layer, graph_id); diff --git a/tests/test_agent_graph_schema_sql.py b/tests/test_agent_graph_schema_sql.py new file mode 100644 index 0000000..51a8cf4 --- /dev/null +++ b/tests/test_agent_graph_schema_sql.py @@ -0,0 +1,129 @@ +from __future__ import annotations + 
+import sqlite3
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[1]  # parent of the directory that holds this test file
+SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-graph-v1.sql"
+
+
+def test_agent_graph_schema_applies_and_models_shared_nodes():  # applies the graph schema, seeds a small graph, checks shared-node counts
+    conn = sqlite3.connect(":memory:")  # NOTE(review): never closed in this test; consider contextlib.closing
+    conn.row_factory = sqlite3.Row  # rows are read by column name below, e.g. ["n"]
+    conn.execute("PRAGMA foreign_keys = ON")  # enable foreign-key enforcement on this connection
+    conn.executescript(SCHEMA_SQL.read_text())  # NOTE(review): no encoding= given, so the locale default is used
+    # Seed two agents, each with one persona revision and one strategy revision.
+    conn.executemany(
+        "INSERT INTO agents (slug, display_name, archetype) VALUES (?, ?, ?)",
+        [
+            ("leo", "Leo", "cross-domain synthesizer"),
+            ("theseus", "Theseus", "AI alignment"),
+        ],
+    )
+    conn.execute(
+        """INSERT INTO agent_persona_revisions
+           (id, agent_slug, revision, identity, voice, role, authored_by)
+           VALUES
+           ('persona-leo-v1', 'leo', 1, 'cross-domain synthesizer', 'direct', 'evaluate commons', 'diagram'),
+           ('persona-theseus-v1', 'theseus', 1, 'alignment maze navigator', 'precise', 'AI evidence lead', 'diagram')"""
+    )
+    conn.execute(
+        """INSERT INTO agent_strategy_revisions
+           (id, agent_slug, persona_revision_id, revision, diagnosis, guiding_policy, proximate_objectives_json)
+           VALUES
+           ('strategy-leo-v1', 'leo', 'persona-leo-v1', 1, 'coordination is the bottleneck', 'surface cross-domain isomorphisms', '[]'),
+           ('strategy-theseus-v1', 'theseus', 'persona-theseus-v1', 1, 'AI discourse is ungrounded', 'separate generation from evaluation', '[]')"""
+    )
+    conn.executemany(  # two evidence rows; verification_status is fixed to 'verified' in the SQL
+        """INSERT INTO evidence
+           (id, evidence_type, title, summary, verification_status)
+           VALUES (?, ?, ?, ?, 'verified')""",
+        [
+            ("e-kim-2025", "study", "Kim et al. ICML 2025", "Shared evidence grounding coordination and verification degradation."),
+            ("e-arrow", "formal_result", "Arrow impossibility theorem", "Formal result grounding alignment impossibility claim."),
+        ],
+    )
+    conn.executemany(  # three claims; c-coordination and c-verification both name e-kim-2025 as primary evidence
+        """INSERT INTO claims
+           (id, slug, domain, description, confidence, primary_evidence_id, status)
+           VALUES (?, ?, ?, ?, ?, ?, 'accepted')""",
+        [
+            ("c-coordination", "alignment-is-coordination", "ai-alignment", "Alignment is a coordination problem, not only a technical one.", "likely", "e-kim-2025"),
+            ("c-verification", "verification-degrades-with-capability", "ai-alignment", "Verification degrades as capability gaps grow.", "experimental", "e-kim-2025"),
+            ("c-arrow", "universal-alignment-impossible", "ai-alignment", "Universal alignment is mathematically impossible under strong aggregation assumptions.", "likely", "e-arrow"),
+        ],
+    )
+    conn.executemany(  # 'supports' edges: e-kim-2025 grounds two claims, e-arrow grounds one
+        """INSERT INTO claim_evidence_edges
+           (id, claim_id, evidence_id, relation, weight, rationale)
+           VALUES (?, ?, ?, 'supports', ?, ?)""",
+        [
+            ("ce-kim-coordination", "c-coordination", "e-kim-2025", 0.9, "Diagram shared-node case: one evidence node grounds multiple claims."),
+            ("ce-kim-verification", "c-verification", "e-kim-2025", 0.8, "Same evidence also grounds verification degradation."),
+            ("ce-arrow", "c-arrow", "e-arrow", 0.9, "Formal result evidence."),
+        ],
+    )
+    conn.executemany(  # one belief for leo, two for theseus
+        """INSERT INTO agent_beliefs
+           (id, agent_slug, belief_code, title, statement, falsification_criteria, is_keystone)
+           VALUES (?, ?, ?, ?, ?, ?, ?)""",
+        [
+            ("b-leo-b1", "leo", "B1", "Coordination bottleneck", "Coordination is the bottleneck.", "Falsified by civ-scale pure-tech solution.", 1),
+            ("b-theseus-t2", "theseus", "T2", "Alignment as coordination", "Alignment is a coordination problem.", "Falsified by a robust one-agent technical alignment solution.", 1),
+            ("b-theseus-t4", "theseus", "T4", "Verification degradation", "Verification degrades faster than capability grows.", "Falsified by scalable oversight evidence.", 0),
+        ],
+    )
+    conn.executemany(  # 'cites' edges: c-coordination is cited by one belief of each agent
+        """INSERT INTO belief_claim_edges
+           (id, belief_id, claim_id, relation, weight, rationale)
+           VALUES (?, ?, ?, 'cites', ?, ?)""",
+        [
+            ("bc-leo-coordination", "b-leo-b1", "c-coordination", 1.0, "Keystone belief cites shared claim."),
+            ("bc-theseus-coordination", "b-theseus-t2", "c-coordination", 0.9, "Different agent cites same shared claim."),
+            ("bc-theseus-verification", "b-theseus-t4", "c-verification", 0.9, "Belief cites verification claim."),
+            ("bc-theseus-arrow", "b-theseus-t2", "c-arrow", 0.6, "Belief also cites formal-result claim."),
+        ],
+    )
+    conn.execute(  # a single claim-to-claim edge between two different claims
+        """INSERT INTO claim_edges
+           (id, from_claim_id, to_claim_id, relation, weight, rationale)
+           VALUES ('edge-verification-supports-coordination', 'c-verification', 'c-coordination', 'supports', 0.6, 'Oversight degradation strengthens coordination framing.')"""
+    )
+    conn.execute(  # one cascade event from evidence e-kim-2025 to claim c-coordination
+        """INSERT INTO cascade_events
+           (id, changed_layer, changed_id, affected_layer, affected_id, reason)
+           VALUES ('cascade-kim-to-coordination', 'evidence', 'e-kim-2025', 'claim', 'c-coordination', 'shared evidence updated')"""
+    )
+    # Read back the edge counts for the shared evidence and the shared claim, plus all cascade events.
+    shared_evidence_count = conn.execute(
+        "SELECT COUNT(*) AS n FROM claim_evidence_edges WHERE evidence_id = 'e-kim-2025'"
+    ).fetchone()["n"]
+    shared_claim_count = conn.execute(
+        "SELECT COUNT(*) AS n FROM belief_claim_edges WHERE claim_id = 'c-coordination'"
+    ).fetchone()["n"]
+    cascade_count = conn.execute("SELECT COUNT(*) AS n FROM cascade_events").fetchone()["n"]
+    # Expected: two claims grounded by e-kim-2025, two beliefs citing c-coordination, one cascade event.
+    assert shared_evidence_count == 2
+    assert shared_claim_count == 2
+    assert cascade_count == 1
+
+
+def test_claim_edges_reject_self_reference():  # expects the schema to refuse an edge from a claim to itself
+    conn = sqlite3.connect(":memory:")
+    conn.execute("PRAGMA foreign_keys = ON")  # enable foreign-key enforcement on this connection
+    conn.executescript(SCHEMA_SQL.read_text())
+    conn.execute(
+        """INSERT INTO claims (id, slug, domain, description)
+           VALUES ('c1', 'claim-one', 'ai-alignment', 'A claim specific enough to disagree with.')"""
+    )
+    # Passes only if the self-edge insert (from_claim_id == to_claim_id) raises sqlite3.IntegrityError.
+    try:
+        conn.execute(
+            """INSERT INTO claim_edges
+               (id, from_claim_id, to_claim_id, relation, rationale)
+               VALUES ('self', 'c1', 'c1', 'related', 'self edge should fail')"""
+        )
+    except sqlite3.IntegrityError:
+        pass
+    else:
+        raise AssertionError("claim_edges allowed a self-reference")
diff --git a/tests/test_research_eval_schema_sql.py b/tests/test_research_eval_schema_sql.py
new file mode 100644
index 0000000..34be318
--- /dev/null
+++ b/tests/test_research_eval_schema_sql.py
@@ -0,0 +1,365 @@
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[1]  # parent of the directory that holds this test file
+GRAPH_SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-graph-v1.sql"
+RESEARCH_EVAL_SCHEMA_SQL = REPO_ROOT / "schemas" / "teleo-agent-research-eval-v1.sql"
+
+
+def _conn() -> sqlite3.Connection:  # in-memory DB with the graph schema applied first, then the research-eval schema
+    conn = sqlite3.connect(":memory:")  # NOTE(review): callers in this file never close the returned connection
+    conn.row_factory = sqlite3.Row  # tests read rows by column name
+    conn.execute("PRAGMA foreign_keys = ON")  # enable foreign-key enforcement on this connection
+    conn.executescript(GRAPH_SCHEMA_SQL.read_text())  # NOTE(review): no encoding= given, so the locale default is used
+    conn.executescript(RESEARCH_EVAL_SCHEMA_SQL.read_text())
+    return conn
+
+
+def test_research_eval_schema_applies_after_graph_schema():  # both schema versions recorded, research-eval tables present
+    conn = _conn()
+    # graph_schema_version must hold exactly these two version strings.
+    versions = {
+        row["version"]
+        for row in conn.execute("SELECT version FROM graph_schema_version").fetchall()
+    }
+    assert versions == {
+        "teleo-agent-graph-v1",
+        "teleo-agent-research-eval-v1",
+    }
+    # Subset check: the six research-eval tables must exist; other tables are not restricted.
+    tables = {
+        row["name"]
+        for row in conn.execute(
+            "SELECT name FROM sqlite_master WHERE type = 'table'"
+        ).fetchall()
+    }
+    assert {
+        "agent_research_runs",
+        "agent_tool_invocations",
+        "agent_research_sources",
+        "agent_eval_cases",
+        "agent_eval_results",
+        "work_order_graph_links",
+    } <= tables
+
+
+def test_ranger_liquidation_case_routes_to_source_backed_research_not_market_data():  # stores one routed run end to end and checks the recorded outcome
+    conn = _conn()
+    conn.execute(
+        "INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
+    )
+    conn.execute(  # eval case whose expected_route is 'web_search'
+        """INSERT INTO agent_eval_cases
+           (id, suite_id, case_slug, prompt_sha256, prompt_excerpt, expected_route,
+            expected_provider, must_use_tools_json, must_not_use_tools_json, tags_json, rubric_json)
+           VALUES
+           (
+             'eval-ranger-liquidated-v1',
+             'leo-research-routing-v1',
+             'ranger-liquidated-not-fair-value',
+             'sha256:ranger-prompt',
+             'Is Ranger Finance fairly valued today given Ranger Finance is liquidated and gone?',
+             'web_search',
+             'agentcash-stableenrich-exa-search',
+             '["source-backed web research"]',
+             '["structured_market_data_only", "live_token_fair_value"]',
+             '["ranger_liquidated", "valuation", "source_verification"]',
+             '{"routing": "verify liquidation before valuation framing"}'
+           )"""
+    )
+    conn.execute(  # the research run under test: selected_route 'web_search', status 'answered'
+        """INSERT INTO agent_research_runs
+           (id, agent_slug, source_surface, source_ref, request_kind, sponsored_work_order_id,
+            payment_receipt_id, prompt_sha256, prompt_excerpt, selected_provider, selected_route,
+            status, answer_sha256, answer_excerpt, proof_ref, cost_amount, latency_ms, source_count)
+           VALUES
+           (
+             'run-ranger-liquidated-001',
+             'leo',
+             'telegram',
+             'telegram:group:message-123',
+             'paid_quote',
+             'sponsored_work_orders:test-ranger-001',
+             'payment_receipts:test-ranger-001',
+             'sha256:ranger-prompt',
+             'Is Ranger Finance fairly valued today given Ranger Finance is liquidated and gone?',
+             'agentcash-stableenrich-exa-search',
+             'web_search',
+             'answered',
+             'sha256:ranger-answer',
+             'Verified liquidation/gone status before valuation framing.',
+             'proof/leo-ranger-liquidated-routing.json',
+             0.01,
+             1240,
+             3
+           )"""
+    )
+    conn.executemany(  # sequence 1: market_data call, decision 'rejected'; sequence 2: web_search call, decision 'executed'
+        """INSERT INTO agent_tool_invocations
+           (id, research_run_id, sequence, provider, tool_name, tool_category, decision,
+            decision_reason, paid, rail, network, amount, payment_receipt_id, input_sha256,
+            output_sha256, source_count, latency_ms)
+           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+        [
+            (
+                "tool-ranger-market-rejected",
+                "run-ranger-liquidated-001",
+                1,
+                "DexScreener",
+                "structured-market-context",
+                "market_data",
+                "rejected",
+                "Ranger liquidation status must be verified before treating this as a live token valuation.",
+                0,
+                "free",
+                None,
+                0,
+                None,
+                "sha256:market-input",
+                None,
+                0,
+                12,
+            ),
+            (
+                "tool-ranger-web-selected",
+                "run-ranger-liquidated-001",
+                2,
+                "AgentCash StableEnrich",
+                "exa-search",
+                "web_search",
+                "executed",
+                "Source-backed liquidation and status verification required.",
+                1,
+                "agentcash",
+                "solana:5eykt4UsFv8P8NJdTREpY1vzqKqZKvdp",
+                0.01,
+                "payment_receipts:test-ranger-001",
+                "sha256:exa-input",
+                "sha256:exa-output",
+                3,
+                1228,
+            ),
+        ],
+    )
+    conn.executemany(  # two sources, both with cited = 1, both attached to the executed web-search invocation
+        """INSERT INTO agent_research_sources
+           (id, research_run_id, tool_invocation_id, source_type, source_uri_sha256,
+            title, cited, retrieval_rank, support_status)
+           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+        [
+            (
+                "source-ranger-official",
+                "run-ranger-liquidated-001",
+                "tool-ranger-web-selected",
+                "web",
+                "sha256:ranger-official",
+                "Ranger status source",
+                1,
+                1,
+                "supports",
+            ),
+            (
+                "source-ranger-forum",
+                "run-ranger-liquidated-001",
+                "tool-ranger-web-selected",
+                "web",
+                "sha256:ranger-forum",
+                "MetaDAO/Ranger discussion source",
+                1,
+                2,
+                "context",
+            ),
+        ],
+    )
+    conn.execute(  # graph-level evaluation row referenced by the eval result inserted next
+        """INSERT INTO graph_evaluation_runs
+           (id, target_layer, target_id, trigger_type, evaluator, verdict, confidence, notes)
+           VALUES
+           (
+             'graph-eval-ranger-routing',
+             'claim',
+             'ranger-liquidated-status',
+             'manual',
+             'leo-research-routing-benchmark',
+             'approve',
+             0.92,
+             'Tool choice matched Ranger liquidation guard.'
+           )"""
+    )
+    conn.execute(  # eval result tying together the case, the run and the graph evaluation
+        """INSERT INTO agent_eval_results
+           (id, eval_case_id, research_run_id, graph_evaluation_run_id, status, score,
+            routing_correct, tool_choice_score, source_quality_score, groundedness_score,
+            freshness_score, cost_efficiency_score, safety_payment_score, proof_ref)
+           VALUES
+           (
+             'eval-result-ranger-liquidated-001',
+             'eval-ranger-liquidated-v1',
+             'run-ranger-liquidated-001',
+             'graph-eval-ranger-routing',
+             'passed',
+             0.94,
+             1,
+             1.0,
+             0.9,
+             0.9,
+             0.85,
+             0.8,
+             1.0,
+             'proof/leo-ranger-liquidated-routing.json'
+           )"""
+    )
+    conn.execute(  # link from the sponsored work order to the research run
+        """INSERT INTO work_order_graph_links
+           (id, sponsored_work_order_id, role, graph_layer, graph_id, rationale)
+           VALUES
+           (
+             'wo-ranger-run-link',
+             'sponsored_work_orders:test-ranger-001',
+             'research_run',
+             'agent_research_run',
+             'run-ranger-liquidated-001',
+             'Paid work order produced source-backed research run.'
+           )"""
+    )
+    # One row expected: the run joined to its eval result, with a count of its sources where cited = 1.
+    row = conn.execute(
+        """SELECT
+             r.selected_route,
+             r.selected_provider,
+             er.status AS eval_status,
+             er.routing_correct,
+             er.tool_choice_score,
+             COUNT(s.id) AS cited_source_count
+           FROM agent_research_runs r
+           JOIN agent_eval_results er ON er.research_run_id = r.id
+           LEFT JOIN agent_research_sources s ON s.research_run_id = r.id AND s.cited = 1
+           WHERE r.id = 'run-ranger-liquidated-001'
+           GROUP BY r.id, er.id"""
+    ).fetchone()
+    # Count this run's market_data invocations separately for 'executed' and 'rejected'.
+    market_executed = conn.execute(
+        """SELECT COUNT(*) AS n
+           FROM agent_tool_invocations
+           WHERE research_run_id = 'run-ranger-liquidated-001'
+             AND tool_category = 'market_data'
+             AND decision = 'executed'"""
+    ).fetchone()["n"]
+    rejected_market = conn.execute(
+        """SELECT COUNT(*) AS n
+           FROM agent_tool_invocations
+           WHERE research_run_id = 'run-ranger-liquidated-001'
+             AND tool_category = 'market_data'
+             AND decision = 'rejected'"""
+    ).fetchone()["n"]
+
+    assert dict(row) == {
+        "selected_route": "web_search",
+        "selected_provider": "agentcash-stableenrich-exa-search",
+        "eval_status": "passed",
+        "routing_correct": 1,
+        "tool_choice_score": 1.0,
+        "cited_source_count": 2,
+    }
+    assert market_executed == 0
+    assert rejected_market == 1
+
+
+def test_schema_rejects_secret_flags_bad_scores_and_bad_tool_decisions():  # every invalid insert must raise sqlite3.IntegrityError
+    conn = _conn()
+    conn.execute(
+        "INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
+    )
+    conn.execute(  # valid run that the invalid rows below refer to
+        """INSERT INTO agent_research_runs
+           (id, agent_slug, source_surface, request_kind, prompt_sha256, selected_route, status)
+           VALUES ('run-constraints', 'leo', 'test', 'benchmark', 'sha256:prompt', 'web_search', 'answered')"""
+    )
+    conn.execute(  # valid eval case that the invalid eval results below refer to
+        """INSERT INTO agent_eval_cases
+           (id, suite_id, case_slug, prompt_sha256, prompt_excerpt, expected_route)
+           VALUES ('case-constraints', 'suite', 'case', 'sha256:prompt', 'redacted prompt', 'web_search')"""
+    )
+    # Invalid values, in order: secret_values_included = 1, decision 'approved', score 1.1, routing_correct 2.
+    invalid_statements = [
+        """INSERT INTO agent_research_runs
+           (id, agent_slug, source_surface, request_kind, prompt_sha256, selected_route, status, secret_values_included)
+           VALUES ('run-secret', 'leo', 'test', 'benchmark', 'sha256:secret', 'web_search', 'answered', 1)""",
+        """INSERT INTO agent_tool_invocations
+           (id, research_run_id, provider, tool_name, tool_category, decision, decision_reason)
+           VALUES ('tool-bad-decision', 'run-constraints', 'p', 't', 'web_search', 'approved', 'bad enum')""",
+        """INSERT INTO agent_eval_results
+           (id, eval_case_id, research_run_id, status, score)
+           VALUES ('eval-bad-score', 'case-constraints', 'run-constraints', 'passed', 1.1)""",
+        """INSERT INTO agent_eval_results
+           (id, eval_case_id, research_run_id, status, routing_correct)
+           VALUES ('eval-bad-bool', 'case-constraints', 'run-constraints', 'passed', 2)""",
+    ]
+
+    for statement in invalid_statements:
+        try:
+            conn.execute(statement)
+        except sqlite3.IntegrityError:
+            pass
+        else:
+            raise AssertionError(f"invalid statement unexpectedly passed: {statement}")
+
+
+def test_research_run_can_be_recorded_without_raw_prompt_or_private_payloads():  # a run and a tool call stored with hashes only
+    conn = _conn()
+    conn.execute(
+        "INSERT INTO agents (slug, display_name, archetype) VALUES ('leo', 'Leo', 'research agent')"
+    )
+    conn.execute(  # no prompt_excerpt or answer_excerpt column is supplied
+        """INSERT INTO agent_research_runs
+           (id, agent_slug, source_surface, source_ref, request_kind, prompt_sha256,
+            selected_route, status, answer_sha256, proof_ref)
+           VALUES
+           (
+             'run-hash-only',
+             'leo',
+             'api',
+             'api:request-redacted',
+             'paid_work_order',
+             'sha256:prompt-only',
+             'social_trends',
+             'answered',
+             'sha256:answer-only',
+             'proof/hash-only.json'
+           )"""
+    )
+    conn.execute(  # only input/output hashes are supplied for the tool call
+        """INSERT INTO agent_tool_invocations
+           (id, research_run_id, provider, tool_name, tool_category, decision,
+            decision_reason, input_sha256, output_sha256)
+           VALUES
+           (
+             'tool-hash-only',
+             'run-hash-only',
+             'AgentCash StableSocial',
+             'lightreel-trends',
+             'social_trends',
+             'executed',
+             'Question asks for current Twitter/X discussion.',
+             'sha256:input-only',
+             'sha256:output-only'
+           )"""
+    )
+    # Neither insert sets the excerpt or secret_values_included columns; read back what was stored for them.
+    row = conn.execute(
+        """SELECT
+             r.prompt_excerpt,
+             r.answer_excerpt,
+             r.secret_values_included AS run_secret_flag,
+             t.secret_values_included AS tool_secret_flag
+           FROM agent_research_runs r
+           JOIN agent_tool_invocations t ON t.research_run_id = r.id
+           WHERE r.id = 'run-hash-only'"""
+    ).fetchone()
+
+    assert row["prompt_excerpt"] is None
+    assert row["answer_excerpt"] is None
+    assert row["run_secret_flag"] == 0
+    assert row["tool_secret_flag"] == 0