teleo-infrastructure/scripts/check_llm_refinement_contract.py
twentyOne2x 71ea7a625c Add decision engine replay harness
- Add source-linked model discovery registry for bakeoff candidates
- Add Rio, Theseus, and KB interop fixtures with deterministic replay proof
- Gate CI on replay output; verify with 424-test suite

`.crabbox.yaml`
`.github/workflows/ci.yml`
`docs/llm-refinement-decision-engine.md`
`docs/model-discovery-registry.md`
`fixtures/decision-engine-eval/kb_interop_propose_only.json`
`fixtures/decision-engine-eval/rio_meteora_lp_incentives.json`
`fixtures/decision-engine-eval/theseus_live_model_switch_reject.json`
`scripts/check_llm_refinement_contract.py`
`scripts/replay_decision_engine_eval.py`
`tests/test_decision_engine_replay.py`
2026-06-01 17:37:38 +02:00

185 lines
6.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""Validate the LLM refinement and decision-engine guidance surface."""
from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
# Repository root, taken as the directory one level above this file's folder.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Shared path prefixes, so each entry below only spells out what is unique.
_DOCS_DIR = REPO_ROOT / "docs"
_SKILLS_DIR = REPO_ROOT / ".agents" / "skills"

# Guidance files read by the check, keyed by a short logical name.  The
# ``*_skill`` keys are the same keys used in SKILL_REQUIRED.
REQUIRED_FILES = {
    "program_doc": _DOCS_DIR / "llm-refinement-decision-engine.md",
    "model_registry": _DOCS_DIR / "model-discovery-registry.md",
    "replay_script": REPO_ROOT / "scripts" / "replay_decision_engine_eval.py",
    **{
        skill_key: _SKILLS_DIR / skill_dir / "SKILL.md"
        for skill_key, skill_dir in (
            ("decision_skill", "decision-engine-refinement"),
            ("db_skill", "teleo-db-operator"),
            ("kb_skill", "living-ip-kb-interop"),
            ("hermes_skill", "nousresearch-hermes-agent"),
            ("openclaw_skill", "openclaw-agent"),
        )
    },
}

# Phrases that must appear verbatim in the program doc.
PROGRAM_REQUIRED_PHRASES = [
    "Pentagon.run should own disposable infrastructure",
    "This repo should own decision quality",
    "Rio becomes the economic and incentive-quality evaluator",
    "Theseus becomes the model-integrity and agent-refinement evaluator",
    "No model switch is accepted because it",
    "Default is read-only",
    "Model Discovery Registry",
    "Any Hermes, OpenClaw, or Claude-style agent",
    "Raw cards and secrets are not agent runtime inputs",
    "scripts/replay_decision_engine_eval.py",
]

# Phrases that must appear verbatim in the model discovery registry.
MODEL_REGISTRY_REQUIRED_PHRASES = [
    "candidate registry, not model approval",
    "GPT-5.5",
    "gpt-oss-20b",
    "Claude Opus 4.8",
    "Gemini 3.5 Flash",
    "Hermes 4 70B",
    "Qwen3.5 9B",
    "Zero false approvals on known-bad fixtures",
]

# Phrases that must appear verbatim in the replay script's source text.
REPLAY_REQUIRED_PHRASES = [
    "decision_engine_replay",
    "false_approve_count",
    "kb_interop_ok",
    "route_accuracy",
]

# Per-skill required phrases; each key indexes into REQUIRED_FILES.
SKILL_REQUIRED = {
    "decision_skill": [
        "Rio economics",
        "Theseus model integrity",
        "Do not change live model assignments",
        "baseline verdict output",
    ],
    "db_skill": [
        "Default to read-only",
        "BEGIN IMMEDIATE",
        "Do not attach, copy, or commit `pipeline.db`",
        "review_records",
    ],
    "kb_skill": [
        "propose-first",
        "kb.search",
        "Do not write directly to main",
        "teleo-db-operator",
    ],
    "hermes_skill": [
        "model switching",
        "fixture-first",
        "Rio Hermes package",
        "Theseus Hermes package",
        "living-ip-kb-interop",
    ],
    "openclaw_skill": [
        "AGENTS.md",
        "SOUL.md",
        "TOOLS.md",
        "Default deny",
        "living-ip-kb-interop",
    ],
}

# Required phrases per fixture file name under fixtures/decision-engine-eval.
FIXTURE_REQUIRED = {
    "rio_meteora_lp_incentives.json": ["rio-economics", "paid_query_effects", "Rio"],
    "theseus_live_model_switch_reject.json": [
        "theseus-model-integrity",
        "model_assignment_without_eval",
        "Theseus",
    ],
    "kb_interop_propose_only.json": ["kb-interop", "no_prod_db_write", "Theseus"],
}
def _read(path: Path) -> str:
    """Return the contents of *path* decoded as UTF-8.

    Args:
        path: File to read; expected to live under ``REPO_ROOT`` so the error
            message can name it relative to the repository.

    Returns:
        The full text of the file.

    Raises:
        AssertionError: If *path* does not exist.
    """
    if not path.exists():
        raise AssertionError(f"missing file: {path.relative_to(REPO_ROOT)}")
    # Pin the encoding: a bare read_text() decodes with the locale's preferred
    # encoding, so the same file could read differently (or fail to decode)
    # depending on the machine running the check.
    return path.read_text(encoding="utf-8")
def _assert_frontmatter(path: Path, text: str) -> None:
    """Require *text* to open with a ``---`` frontmatter block holding two keys.

    The block must start on the first line, be closed by a ``---`` line, and
    contain both a ``name`` and a ``description`` key.

    Args:
        path: Location of the file, used only to name it in error messages.
        text: Full contents of the file.

    Raises:
        AssertionError: If the frontmatter block is absent, or if either key
            is missing from it.
    """
    match = re.match(r"^---\n(?P<body>.*?)\n---\n", text, flags=re.DOTALL)
    if not match:
        raise AssertionError(f"{path.relative_to(REPO_ROOT)} missing YAML frontmatter")
    body = match.group("body")
    # Look for each key at the start of a line.  A plain substring test such as
    # `"name:" in body` is also satisfied by `filename:` or by text inside a
    # value, which lets a frontmatter with no real `name` key slip through.
    absent_keys = [
        key
        for key in ("name", "description")
        if not re.search(rf"^{key}[ \t]*:", body, flags=re.MULTILINE)
    ]
    if absent_keys:
        raise AssertionError(f"{path.relative_to(REPO_ROOT)} frontmatter needs name and description")
def _require_phrases(label: str, text: str, phrases: list[str]) -> None:
    """Raise ``AssertionError`` listing each of *phrases* not found in *text*.

    *label* prefixes the message so the failure names the offending file.
    """
    missing = [phrase for phrase in phrases if phrase not in text]
    if missing:
        raise AssertionError(f"{label} missing phrases: {missing}")


def main(argv: list[str] | None = None) -> int:
    """Check docs, replay script, fixtures and skills, then emit a proof JSON.

    Args:
        argv: Command-line arguments to parse.  ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is what the script entry
            point relies on.

    Returns:
        ``0`` after every check has passed and the proof has been written to
        the ``--output`` path and printed to stdout.

    Raises:
        AssertionError: On the first missing file, missing set of phrases, or
            invalid skill frontmatter.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--output",
        default=".crabbox-results/llm-refinement-contract.json",
        help="where to write the proof JSON; relative paths resolve against the repo root",
    )
    args = parser.parse_args(argv)

    # Single-file checks, in a fixed order so the first failure is stable.
    _require_phrases("program doc", _read(REQUIRED_FILES["program_doc"]), PROGRAM_REQUIRED_PHRASES)
    _require_phrases("model registry", _read(REQUIRED_FILES["model_registry"]), MODEL_REGISTRY_REQUIRED_PHRASES)
    _require_phrases("replay script", _read(REQUIRED_FILES["replay_script"]), REPLAY_REQUIRED_PHRASES)

    fixture_checks = {}
    fixtures_dir = REPO_ROOT / "fixtures" / "decision-engine-eval"
    for filename, phrases in FIXTURE_REQUIRED.items():
        path = fixtures_dir / filename
        text = _read(path)
        relative = str(path.relative_to(REPO_ROOT))
        _require_phrases(relative, text, phrases)
        fixture_checks[filename] = {"path": relative, "phrases_checked": phrases}

    skill_checks = {}
    for key, phrases in SKILL_REQUIRED.items():
        path = REQUIRED_FILES[key]
        text = _read(path)
        # Frontmatter is validated before phrases, so a malformed skill file
        # is reported as such rather than as a phrase mismatch.
        _assert_frontmatter(path, text)
        relative = str(path.relative_to(REPO_ROOT))
        _require_phrases(relative, text, phrases)
        skill_checks[key] = {"path": relative, "phrases_checked": phrases}

    # NOTE(review): the replay script is checked above but is not recorded in
    # the proof; left as-is so the proof keeps its existing keys.
    proof = {
        "ok": True,
        "scope": "llm_refinement_decision_engine_contract",
        "program_doc": str(REQUIRED_FILES["program_doc"].relative_to(REPO_ROOT)),
        "model_registry": str(REQUIRED_FILES["model_registry"].relative_to(REPO_ROOT)),
        "program_phrases_checked": PROGRAM_REQUIRED_PHRASES,
        "model_registry_phrases_checked": MODEL_REGISTRY_REQUIRED_PHRASES,
        "fixtures": fixture_checks,
        "skills": skill_checks,
        "pivot": {
            "infra_owner": "Pentagon.run",
            "repo_owner": "decision quality, rubrics, model evals, prompt/tool refinement, DB feedback loops",
        },
    }

    # Serialise once so the file and stdout are guaranteed to carry the same JSON.
    rendered = json.dumps(proof, indent=2, sort_keys=True)
    output = REPO_ROOT / args.output
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(rendered + "\n", encoding="utf-8")
    print(rendered)
    return 0
if __name__ == "__main__":
raise SystemExit(main())