#!/usr/bin/env python3
"""Validate the LLM refinement and decision-engine guidance surface."""

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path

# The repository root is one directory above the directory holding this script.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Files whose presence and content are checked, keyed by a short label.
REQUIRED_FILES = {
    "program_doc": REPO_ROOT / "docs" / "llm-refinement-decision-engine.md",
    "model_registry": REPO_ROOT / "docs" / "model-discovery-registry.md",
    "replay_script": REPO_ROOT / "scripts" / "replay_decision_engine_eval.py",
    "decision_skill": REPO_ROOT / ".agents" / "skills" / "decision-engine-refinement" / "SKILL.md",
    "db_skill": REPO_ROOT / ".agents" / "skills" / "teleo-db-operator" / "SKILL.md",
    "kb_skill": REPO_ROOT / ".agents" / "skills" / "living-ip-kb-interop" / "SKILL.md",
    "hermes_skill": REPO_ROOT / ".agents" / "skills" / "nousresearch-hermes-agent" / "SKILL.md",
    "openclaw_skill": REPO_ROOT / ".agents" / "skills" / "openclaw-agent" / "SKILL.md",
}

PROGRAM_REQUIRED_PHRASES = [
    "Pentagon.run should own disposable infrastructure",
    "This repo should own decision quality",
    "Rio becomes the economic and incentive-quality evaluator",
    "Theseus becomes the model-integrity and agent-refinement evaluator",
    "No model switch is accepted because it",
    "Default is read-only",
    "Model Discovery Registry",
    "Any Hermes, OpenClaw, or Claude-style agent",
    "Raw cards and secrets are not agent runtime inputs",
    "scripts/replay_decision_engine_eval.py",
]

MODEL_REGISTRY_REQUIRED_PHRASES = [
    "candidate registry, not model approval",
    "GPT-5.5",
    "gpt-oss-20b",
    "Claude Opus 4.8",
    "Gemini 3.5 Flash",
    "Hermes 4 70B",
    "Qwen3.5 9B",
    "Zero false approvals on known-bad fixtures",
]

REPLAY_REQUIRED_PHRASES = [
    "decision_engine_replay",
    "false_approve_count",
    "kb_interop_ok",
    "route_accuracy",
]

# Phrases each skill file must contain; keys match entries in REQUIRED_FILES.
SKILL_REQUIRED = {
    "decision_skill": [
        "Rio economics",
        "Theseus model integrity",
        "Do not change live model assignments",
        "baseline verdict output",
    ],
    "db_skill": [
        "Default to read-only",
        "BEGIN IMMEDIATE",
        "Do not attach, copy, or commit `pipeline.db`",
        "review_records",
    ],
    "kb_skill": [
        "propose-first",
        "kb.search",
        "Do not write directly to main",
        "teleo-db-operator",
    ],
    "hermes_skill": [
        "model switching",
        "fixture-first",
        "Rio Hermes package",
        "Theseus Hermes package",
        "living-ip-kb-interop",
    ],
    "openclaw_skill": [
        "AGENTS.md",
        "SOUL.md",
        "TOOLS.md",
        "Default deny",
        "living-ip-kb-interop",
    ],
}

# Phrases each fixture file (under fixtures/decision-engine-eval) must contain.
FIXTURE_REQUIRED = {
    "rio_meteora_lp_incentives.json": ["rio-economics", "paid_query_effects", "Rio"],
    "theseus_live_model_switch_reject.json": [
        "theseus-model-integrity",
        "model_assignment_without_eval",
        "Theseus",
    ],
    "kb_interop_propose_only.json": ["kb-interop", "no_prod_db_write", "Theseus"],
}

# Leading YAML frontmatter: a '---' line, the body (group 1), a closing '---' line.
_FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---\n", flags=re.DOTALL)


def _read(path: Path) -> str:
    """Return the text of *path*, decoded as UTF-8.

    Raises:
        AssertionError: if *path* does not exist.
    """
    if not path.exists():
        raise AssertionError(f"missing file: {path.relative_to(REPO_ROOT)}")
    # Explicit encoding so the result does not depend on the machine's locale.
    return path.read_text(encoding="utf-8")


def _assert_frontmatter(path: Path, text: str) -> None:
    """Check that *text* opens with frontmatter declaring name and description.

    *path* is used only to label error messages.

    Raises:
        AssertionError: if the frontmatter block is absent, or if it lacks
            either a ``name:`` or a ``description:`` entry.
    """
    match = _FRONTMATTER_RE.match(text)
    if not match:
        raise AssertionError(f"{path.relative_to(REPO_ROOT)} missing YAML frontmatter")
    body = match.group(1)
    if "name:" not in body or "description:" not in body:
        raise AssertionError(f"{path.relative_to(REPO_ROOT)} frontmatter needs name and description")


def _require_phrases(label: object, text: str, phrases: list[str]) -> None:
    """Raise AssertionError listing every phrase in *phrases* absent from *text*."""
    missing = [phrase for phrase in phrases if phrase not in text]
    if missing:
        raise AssertionError(f"{label} missing phrases: {missing}")


def main() -> int:
    """Run every contract check, then write and print a JSON proof record.

    Returns:
        0 when all checks pass.

    Raises:
        AssertionError: on the first missing file, missing phrase, or
            malformed skill frontmatter.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", default=".crabbox-results/llm-refinement-contract.json")
    args = parser.parse_args()

    _require_phrases("program doc", _read(REQUIRED_FILES["program_doc"]), PROGRAM_REQUIRED_PHRASES)
    _require_phrases(
        "model registry",
        _read(REQUIRED_FILES["model_registry"]),
        MODEL_REGISTRY_REQUIRED_PHRASES,
    )
    _require_phrases("replay script", _read(REQUIRED_FILES["replay_script"]), REPLAY_REQUIRED_PHRASES)

    fixture_checks = {}
    fixtures_dir = REPO_ROOT / "fixtures" / "decision-engine-eval"
    for filename, phrases in FIXTURE_REQUIRED.items():
        path = fixtures_dir / filename
        text = _read(path)
        _require_phrases(path.relative_to(REPO_ROOT), text, phrases)
        fixture_checks[filename] = {
            "path": str(path.relative_to(REPO_ROOT)),
            "phrases_checked": phrases,
        }

    skill_checks = {}
    for key, phrases in SKILL_REQUIRED.items():
        path = REQUIRED_FILES[key]
        text = _read(path)
        _assert_frontmatter(path, text)
        _require_phrases(path.relative_to(REPO_ROOT), text, phrases)
        skill_checks[key] = {
            "path": str(path.relative_to(REPO_ROOT)),
            "phrases_checked": phrases,
        }

    proof = {
        "ok": True,
        "scope": "llm_refinement_decision_engine_contract",
        "program_doc": str(REQUIRED_FILES["program_doc"].relative_to(REPO_ROOT)),
        "model_registry": str(REQUIRED_FILES["model_registry"].relative_to(REPO_ROOT)),
        "program_phrases_checked": PROGRAM_REQUIRED_PHRASES,
        "model_registry_phrases_checked": MODEL_REGISTRY_REQUIRED_PHRASES,
        "fixtures": fixture_checks,
        "skills": skill_checks,
        "pivot": {
            "infra_owner": "Pentagon.run",
            "repo_owner": "decision quality, rubrics, model evals, prompt/tool refinement, DB feedback loops",
        },
    }

    # Serialise once; the same text goes to the output file and to stdout.
    rendered = json.dumps(proof, indent=2, sort_keys=True)
    output = REPO_ROOT / args.output
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(rendered + "\n", encoding="utf-8")
    print(rendered)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())