- Add source-linked model discovery registry for bakeoff candidates - Add Rio, Theseus, and KB interop fixtures with deterministic replay proof - Gate CI on replay output; verify with 424-test suite `.crabbox.yaml` `.github/workflows/ci.yml` `docs/llm-refinement-decision-engine.md` `docs/model-discovery-registry.md` `fixtures/decision-engine-eval/kb_interop_propose_only.json` `fixtures/decision-engine-eval/rio_meteora_lp_incentives.json` `fixtures/decision-engine-eval/theseus_live_model_switch_reject.json` `scripts/check_llm_refinement_contract.py` `scripts/replay_decision_engine_eval.py` `tests/test_decision_engine_replay.py`
56 lines
1.8 KiB
Python
56 lines
1.8 KiB
Python
from __future__ import annotations
|
|
|
|
import importlib.util
|
|
import json
|
|
from pathlib import Path
|
|
|
|
# Repository root: the parent of the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[1]
SCRIPT_PATH = REPO_ROOT / "scripts" / "replay_decision_engine_eval.py"
FIXTURES_DIR = REPO_ROOT / "fixtures" / "decision-engine-eval"

# The replay script is loaded by file path and exposed to the tests as ``replay``.
spec = importlib.util.spec_from_file_location("replay_decision_engine_eval", SCRIPT_PATH)
# Validate the spec *before* it is used: spec_from_file_location may return
# None, and module_from_spec(None) would fail with an unhelpful AttributeError.
assert spec is not None, f"could not build an import spec for {SCRIPT_PATH}"
assert spec.loader is not None, f"no loader available for {SCRIPT_PATH}"
replay = importlib.util.module_from_spec(spec)
spec.loader.exec_module(replay)
|
|
|
|
|
|
def test_default_decision_engine_fixtures_replay_cleanly():
    """Replay the fixtures in FIXTURES_DIR and check the resulting proof.

    The proof must report success, exactly three fixtures, a route accuracy
    of 1.0, and one fixture in each of the three expected lanes.
    """
    expected_lanes = {
        "kb-interop": 1,
        "rio-economics": 1,
        "theseus-model-integrity": 1,
    }

    result = replay.evaluate_fixtures(replay.load_fixtures(FIXTURES_DIR))

    assert result["ok"] is True
    assert result["fixture_count"] == 3

    metrics = result["metrics"]
    assert metrics["route_accuracy"] == 1.0
    assert metrics["lanes"] == expected_lanes
|
|
|
|
|
|
def test_candidate_false_approve_is_caught(tmp_path):
    """Check that a candidate approving a reject fixture fails the replay.

    A candidate output file with a single "approve" verdict for the
    ``theseus_live_model_switch_reject`` fixture is written to ``tmp_path``,
    loaded back through the replay module, and evaluated. The proof must be
    not-ok and must list that fixture as the only false approve.
    """
    fixture_set = replay.load_fixtures(FIXTURES_DIR)

    bad_verdict = {
        "fixture_id": "theseus_live_model_switch_reject",
        "disposition": "approve",
        "issue_tags": [],
        "primary_agent": "Theseus",
        "required_agents": ["Theseus"],
    }
    payload = {
        "candidate_name": "bad-single-answer-model",
        "verdicts": [bad_verdict],
    }

    output_file = tmp_path / "candidate.json"
    output_file.write_text(json.dumps(payload))

    loaded = replay._load_candidate_output(output_file)
    result = replay.evaluate_fixtures(fixture_set, candidate=loaded)

    assert result["ok"] is False

    candidate_report = result["candidate"]
    assert candidate_report["false_approve_count"] == 1
    assert candidate_report["false_approves"] == ["theseus_live_model_switch_reject"]
|