# - Add source-linked model discovery registry for bakeoff candidates
# - Add Rio, Theseus, and KB interop fixtures with deterministic replay proof
# - Gate CI on replay output; verify with 424-test suite
#
# Files:
#   `.crabbox.yaml`
#   `.github/workflows/ci.yml`
#   `docs/llm-refinement-decision-engine.md`
#   `docs/model-discovery-registry.md`
#   `fixtures/decision-engine-eval/kb_interop_propose_only.json`
#   `fixtures/decision-engine-eval/rio_meteora_lp_incentives.json`
#   `fixtures/decision-engine-eval/theseus_live_model_switch_reject.json`
#   `scripts/check_llm_refinement_contract.py`
#   `scripts/replay_decision_engine_eval.py`
#   `tests/test_decision_engine_replay.py`
185 lines
6.4 KiB
Python
Executable file
185 lines
6.4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""Validate the LLM refinement and decision-engine guidance surface."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
# Repository root: the parent of the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Shared path prefixes for the entries below.
_DOCS_DIR = REPO_ROOT / "docs"
_SKILLS_DIR = REPO_ROOT / ".agents" / "skills"

# Files read by the contract check, keyed by a short logical name.
REQUIRED_FILES = {
    "program_doc": _DOCS_DIR / "llm-refinement-decision-engine.md",
    "model_registry": _DOCS_DIR / "model-discovery-registry.md",
    "replay_script": REPO_ROOT / "scripts" / "replay_decision_engine_eval.py",
    # Each skill file sits at .agents/skills/<folder>/SKILL.md.
    **{
        key: _SKILLS_DIR / folder / "SKILL.md"
        for key, folder in (
            ("decision_skill", "decision-engine-refinement"),
            ("db_skill", "teleo-db-operator"),
            ("kb_skill", "living-ip-kb-interop"),
            ("hermes_skill", "nousresearch-hermes-agent"),
            ("openclaw_skill", "openclaw-agent"),
        )
    },
}
|
|
|
|
# Exact substrings that main() requires in the "program_doc" file; each one
# is matched case-sensitively with a plain ``in`` test.
PROGRAM_REQUIRED_PHRASES: list[str] = [
    # Ownership split between infrastructure and this repository.
    "Pentagon.run should own disposable infrastructure",
    "This repo should own decision quality",
    # Evaluator roles.
    "Rio becomes the economic and incentive-quality evaluator",
    "Theseus becomes the model-integrity and agent-refinement evaluator",
    # Guardrail and interop statements.
    "No model switch is accepted because it",
    "Default is read-only",
    "Model Discovery Registry",
    "Any Hermes, OpenClaw, or Claude-style agent",
    "Raw cards and secrets are not agent runtime inputs",
    # Pointer to the replay script.
    "scripts/replay_decision_engine_eval.py",
]
|
|
|
|
# Exact substrings that main() requires in the "model_registry" file.
MODEL_REGISTRY_REQUIRED_PHRASES: list[str] = [
    "candidate registry, not model approval",
    # Model names the registry document must mention.
    "GPT-5.5",
    "gpt-oss-20b",
    "Claude Opus 4.8",
    "Gemini 3.5 Flash",
    "Hermes 4 70B",
    "Qwen3.5 9B",
    "Zero false approvals on known-bad fixtures",
]
|
|
|
|
# Exact substrings that main() requires in the source text of the
# "replay_script" file.
REPLAY_REQUIRED_PHRASES: list[str] = [
    "decision_engine_replay",
    "false_approve_count",
    "kb_interop_ok",
    "route_accuracy",
]
|
|
|
|
# Required substrings per skill file. Each key is also a key of
# REQUIRED_FILES, which main() uses to locate the file to scan.
SKILL_REQUIRED: dict[str, list[str]] = {
    "decision_skill": [
        "Rio economics",
        "Theseus model integrity",
        "Do not change live model assignments",
        "baseline verdict output",
    ],
    "db_skill": [
        "Default to read-only",
        "BEGIN IMMEDIATE",
        "Do not attach, copy, or commit `pipeline.db`",
        "review_records",
    ],
    "kb_skill": [
        "propose-first",
        "kb.search",
        "Do not write directly to main",
        "teleo-db-operator",
    ],
    "hermes_skill": [
        "model switching",
        "fixture-first",
        "Rio Hermes package",
        "Theseus Hermes package",
        "living-ip-kb-interop",
    ],
    "openclaw_skill": [
        "AGENTS.md",
        "SOUL.md",
        "TOOLS.md",
        "Default deny",
        "living-ip-kb-interop",
    ],
}
|
|
|
|
# Required substrings per fixture, keyed by file name. main() resolves each
# name inside fixtures/decision-engine-eval/ under the repository root.
FIXTURE_REQUIRED: dict[str, list[str]] = {
    "rio_meteora_lp_incentives.json": [
        "rio-economics",
        "paid_query_effects",
        "Rio",
    ],
    "theseus_live_model_switch_reject.json": [
        "theseus-model-integrity",
        "model_assignment_without_eval",
        "Theseus",
    ],
    "kb_interop_propose_only.json": [
        "kb-interop",
        "no_prod_db_write",
        "Theseus",
    ],
}
|
|
|
|
|
|
def _read(path: Path) -> str:
|
|
if not path.exists():
|
|
raise AssertionError(f"missing file: {path.relative_to(REPO_ROOT)}")
|
|
return path.read_text()
|
|
|
|
|
|
def _assert_frontmatter(path: Path, text: str) -> None:
|
|
match = re.match(r"^---\n(?P<body>.*?)\n---\n", text, flags=re.DOTALL)
|
|
if not match:
|
|
raise AssertionError(f"{path.relative_to(REPO_ROOT)} missing YAML frontmatter")
|
|
body = match.group("body")
|
|
if "name:" not in body or "description:" not in body:
|
|
raise AssertionError(f"{path.relative_to(REPO_ROOT)} frontmatter needs name and description")
|
|
|
|
|
|
def _require_phrases(label: str, text: str, phrases: list[str]) -> None:
    """Raise AssertionError listing every entry of ``phrases`` absent from ``text``."""
    missing = [phrase for phrase in phrases if phrase not in text]
    if missing:
        raise AssertionError(f"{label} missing phrases: {missing}")


def _check_file(path: Path, phrases: list[str], *, frontmatter: bool = False) -> dict[str, object]:
    """Read ``path``, verify it, and return its entry for the proof document.

    When ``frontmatter`` is true the frontmatter check runs before the
    phrase check, so a file failing both reports the frontmatter problem.
    """
    text = _read(path)
    if frontmatter:
        _assert_frontmatter(path, text)
    relative = str(path.relative_to(REPO_ROOT))
    _require_phrases(relative, text, phrases)
    return {"path": relative, "phrases_checked": phrases}


def main(argv: list[str] | None = None) -> int:
    """Run every contract check and write a JSON proof of what was checked.

    Checks run in a fixed order: program doc, model registry, replay script,
    fixtures, then skills. The first failure aborts the run.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``. The only option is ``--output``,
            a path joined onto ``REPO_ROOT``.

    Returns:
        0 when every check passes.

    Raises:
        AssertionError: If a required file is missing, a required phrase is
            absent, or a skill file's frontmatter is rejected.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", default=".crabbox-results/llm-refinement-contract.json")
    args = parser.parse_args(argv)

    _require_phrases("program doc", _read(REQUIRED_FILES["program_doc"]), PROGRAM_REQUIRED_PHRASES)
    _require_phrases("model registry", _read(REQUIRED_FILES["model_registry"]), MODEL_REGISTRY_REQUIRED_PHRASES)
    _require_phrases("replay script", _read(REQUIRED_FILES["replay_script"]), REPLAY_REQUIRED_PHRASES)

    fixtures_dir = REPO_ROOT / "fixtures" / "decision-engine-eval"
    fixture_checks = {
        filename: _check_file(fixtures_dir / filename, phrases)
        for filename, phrases in FIXTURE_REQUIRED.items()
    }

    # Skill files must also carry frontmatter.
    skill_checks = {
        key: _check_file(REQUIRED_FILES[key], phrases, frontmatter=True)
        for key, phrases in SKILL_REQUIRED.items()
    }

    # NOTE(review): the replay script's phrases are verified above but are
    # not recorded in the proof; confirm whether that omission is intended.
    proof = {
        "ok": True,
        "scope": "llm_refinement_decision_engine_contract",
        "program_doc": str(REQUIRED_FILES["program_doc"].relative_to(REPO_ROOT)),
        "model_registry": str(REQUIRED_FILES["model_registry"].relative_to(REPO_ROOT)),
        "program_phrases_checked": PROGRAM_REQUIRED_PHRASES,
        "model_registry_phrases_checked": MODEL_REGISTRY_REQUIRED_PHRASES,
        "fixtures": fixture_checks,
        "skills": skill_checks,
        "pivot": {
            "infra_owner": "Pentagon.run",
            "repo_owner": "decision quality, rubrics, model evals, prompt/tool refinement, DB feedback loops",
        },
    }

    # Serialize once so the file and stdout carry the same document.
    rendered = json.dumps(proof, indent=2, sort_keys=True)
    output = REPO_ROOT / args.output
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(rendered + "\n", encoding="utf-8")
    print(rendered)
    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|