teleo-infrastructure/tests/test_decision_engine_replay.py
twentyOne2x 71ea7a625c Add decision engine replay harness
- Add source-linked model discovery registry for bakeoff candidates
- Add Rio, Theseus, and KB interop fixtures with deterministic replay proof
- Gate CI on replay output; verify with 424-test suite

`.crabbox.yaml`
`.github/workflows/ci.yml`
`docs/llm-refinement-decision-engine.md`
`docs/model-discovery-registry.md`
`fixtures/decision-engine-eval/kb_interop_propose_only.json`
`fixtures/decision-engine-eval/rio_meteora_lp_incentives.json`
`fixtures/decision-engine-eval/theseus_live_model_switch_reject.json`
`scripts/check_llm_refinement_contract.py`
`scripts/replay_decision_engine_eval.py`
`tests/test_decision_engine_replay.py`
2026-06-01 17:37:38 +02:00

56 lines
1.8 KiB
Python

from __future__ import annotations
import importlib.util
import json
from pathlib import Path
# Paths are resolved relative to this test file: parents[1] is the directory
# one level above the one that contains this module.
REPO_ROOT = Path(__file__).resolve().parents[1]
SCRIPT_PATH = REPO_ROOT / "scripts" / "replay_decision_engine_eval.py"
FIXTURES_DIR = REPO_ROOT / "fixtures" / "decision-engine-eval"

# The replay script is loaded by file path and exposed to the tests as `replay`.
spec = importlib.util.spec_from_file_location("replay_decision_engine_eval", SCRIPT_PATH)
# spec_from_file_location may return None, so validate the spec and its loader
# *before* the spec is first used; otherwise a bad path surfaces as an
# unrelated AttributeError instead of a clear failure here.
assert spec is not None and spec.loader is not None, (
    f"cannot build an import spec for {SCRIPT_PATH}"
)
replay = importlib.util.module_from_spec(spec)
spec.loader.exec_module(replay)
def test_default_decision_engine_fixtures_replay_cleanly():
    """Replaying the checked-in fixtures yields a passing proof.

    The proof must report ``ok``, exactly three fixtures, a route accuracy
    of 1.0, and one fixture in each of the three expected lanes.
    """
    expected_lanes = {
        "kb-interop": 1,
        "rio-economics": 1,
        "theseus-model-integrity": 1,
    }

    loaded = replay.load_fixtures(FIXTURES_DIR)
    report = replay.evaluate_fixtures(loaded)

    assert report["ok"] is True
    assert report["fixture_count"] == 3

    # Metrics are read only after the top-level checks, as in the original
    # assertion order.
    metrics = report["metrics"]
    assert metrics["route_accuracy"] == 1.0
    assert metrics["lanes"] == expected_lanes
def test_candidate_false_approve_is_caught(tmp_path):
    """A candidate that approves the reject fixture is flagged.

    Writes a candidate output file containing a single "approve" verdict for
    the ``theseus_live_model_switch_reject`` fixture, replays the fixtures
    against it, and expects the proof to fail with that fixture listed as
    the only false approve.
    """
    rejected_fixture_id = "theseus_live_model_switch_reject"

    loaded = replay.load_fixtures(FIXTURES_DIR)

    # Build the candidate payload piece by piece; key order is kept so the
    # serialized JSON matches what the test has always written.
    bad_verdict = {
        "fixture_id": rejected_fixture_id,
        "disposition": "approve",
        "issue_tags": [],
        "primary_agent": "Theseus",
        "required_agents": ["Theseus"],
    }
    payload = {
        "candidate_name": "bad-single-answer-model",
        "verdicts": [bad_verdict],
    }

    candidate_path = tmp_path / "candidate.json"
    serialized = json.dumps(payload)
    candidate_path.write_text(serialized)

    candidate = replay._load_candidate_output(candidate_path)
    report = replay.evaluate_fixtures(loaded, candidate=candidate)

    assert report["ok"] is False
    candidate_report = report["candidate"]
    assert candidate_report["false_approve_count"] == 1
    assert candidate_report["false_approves"] == [rejected_fixture_id]