teleo-infrastructure/scripts/check_llm_refinement_contract.py

#!/usr/bin/env python3
"""Validate the LLM refinement and decision-engine guidance surface."""

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path

# Repository root, resolved as the parent of the directory containing this
# script. Every path checked or written below is built relative to it.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Short key -> path of each file the contract check reads.
# The "*_skill" keys are looked up by name from SKILL_REQUIRED, so the two
# mappings must stay in sync; the remaining keys are read directly in main().
REQUIRED_FILES = {
    "program_doc": REPO_ROOT / "docs" / "llm-refinement-decision-engine.md",
    "model_registry": REPO_ROOT / "docs" / "model-discovery-registry.md",
    "replay_script": REPO_ROOT / "scripts" / "replay_decision_engine_eval.py",
    "decision_skill": REPO_ROOT / ".agents" / "skills" / "decision-engine-refinement" / "SKILL.md",
    "db_skill": REPO_ROOT / ".agents" / "skills" / "teleo-db-operator" / "SKILL.md",
    "kb_skill": REPO_ROOT / ".agents" / "skills" / "living-ip-kb-interop" / "SKILL.md",
    "hermes_skill": REPO_ROOT / ".agents" / "skills" / "nousresearch-hermes-agent" / "SKILL.md",
    "openclaw_skill": REPO_ROOT / ".agents" / "skills" / "openclaw-agent" / "SKILL.md",
}

# Substrings that must each occur verbatim (case-sensitive `in` test) in the
# program doc, REQUIRED_FILES["program_doc"]. main() raises AssertionError
# listing every phrase that is absent.
PROGRAM_REQUIRED_PHRASES = [
    "Pentagon.run should own disposable infrastructure",
    "This repo should own decision quality",
    "Rio becomes the economic and incentive-quality evaluator",
    "Theseus becomes the model-integrity and agent-refinement evaluator",
    # NOTE(review): this looks like a deliberately truncated sentence prefix
    # (presumably the sentence wraps in the doc) — confirm before "fixing" it.
    "No model switch is accepted because it",
    "Default is read-only",
    "Model Discovery Registry",
    "Any Hermes, OpenClaw, or Claude-style agent",
    "Raw cards and secrets are not agent runtime inputs",
    "scripts/replay_decision_engine_eval.py",
]

# Substrings that must each occur verbatim (case-sensitive `in` test) in the
# model registry doc, REQUIRED_FILES["model_registry"]. main() raises
# AssertionError listing every phrase that is absent.
MODEL_REGISTRY_REQUIRED_PHRASES = [
    "candidate registry, not model approval",
    "GPT-5.5",
    "gpt-oss-20b",
    "Claude Opus 4.8",
    "Gemini 3.5 Flash",
    "Hermes 4 70B",
    "Qwen3.5 9B",
    "Zero false approvals on known-bad fixtures",
]

# Substrings that must each occur verbatim in the source text of the replay
# script, REQUIRED_FILES["replay_script"]. The script is only read as text,
# never imported or executed. Unlike the program-doc and registry lists, this
# list is not echoed into the proof JSON written by main().
REPLAY_REQUIRED_PHRASES = [
    "decision_engine_replay",
    "false_approve_count",
    "kb_interop_ok",
    "route_accuracy",
]

# REQUIRED_FILES key -> substrings that must occur verbatim in that SKILL.md.
# Every key here must also exist in REQUIRED_FILES (main() indexes it by key).
# In addition to the phrase check, each skill file must start with a
# `---` fenced frontmatter block mentioning "name:" and "description:".
SKILL_REQUIRED = {
    "decision_skill": [
        "Rio economics",
        "Theseus model integrity",
        "Do not change live model assignments",
        "baseline verdict output",
    ],
    "db_skill": [
        "Default to read-only",
        "BEGIN IMMEDIATE",
        "Do not attach, copy, or commit `pipeline.db`",
        "review_records",
    ],
    "kb_skill": [
        "propose-first",
        "kb.search",
        "Do not write directly to main",
        "teleo-db-operator",
    ],
    "hermes_skill": [
        "model switching",
        "fixture-first",
        "Rio Hermes package",
        "Theseus Hermes package",
        "living-ip-kb-interop",
    ],
    "openclaw_skill": [
        "AGENTS.md",
        "SOUL.md",
        "TOOLS.md",
        "Default deny",
        "living-ip-kb-interop",
    ],
}

# Fixture file name (under fixtures/decision-engine-eval/) -> substrings that
# must occur verbatim in the file. Matching is done against the raw file text;
# the JSON is not parsed, so a phrase may match a key, a value, or any other
# part of the document.
FIXTURE_REQUIRED = {
    "rio_meteora_lp_incentives.json": ["rio-economics", "paid_query_effects", "Rio"],
    "theseus_live_model_switch_reject.json": [
        "theseus-model-integrity",
        "model_assignment_without_eval",
        "Theseus",
    ],
    "kb_interop_propose_only.json": ["kb-interop", "no_prod_db_write", "Theseus"],
}


def _read(path: Path) -> str:
    """Return the full text of *path*, decoded as UTF-8.

    Args:
        path: File to load. It is expected to live under ``REPO_ROOT`` so
            that a missing file can be reported by its repo-relative path.

    Returns:
        The decoded file contents.

    Raises:
        AssertionError: If *path* does not exist.
    """
    if not path.exists():
        raise AssertionError(f"missing file: {path.relative_to(REPO_ROOT)}")
    # Decode explicitly. Without an encoding, read_text() falls back to the
    # host locale, so the same docs could decode differently (or fail to
    # decode at all) depending on the machine running the check.
    return path.read_text(encoding="utf-8")


def _assert_frontmatter(path: Path, text: str) -> None:
    """Check that *text* opens with a ``---`` fenced header naming both keys.

    The header must start on the very first line of *text* and be closed by a
    line containing only ``---``. The header body is then searched for the
    plain substrings ``name:`` and ``description:``; it is not parsed as YAML.

    Args:
        path: Location of the file *text* came from; used only to build the
            repo-relative path shown in error messages.
        text: Full contents of the file.

    Raises:
        AssertionError: If the fenced header is absent, or if either
            substring is missing from it.
    """
    fence = re.compile(r"^---\n(?P<body>.*?)\n---\n", re.DOTALL).match(text)
    if fence is None:
        raise AssertionError(f"{path.relative_to(REPO_ROOT)} missing YAML frontmatter")

    header = fence.group("body")
    if all(key in header for key in ("name:", "description:")):
        return
    raise AssertionError(f"{path.relative_to(REPO_ROOT)} frontmatter needs name and description")


def _require_phrases(label: object, text: str, phrases: list[str]) -> None:
    """Raise ``AssertionError`` naming every entry of *phrases* absent from *text*."""
    missing = [phrase for phrase in phrases if phrase not in text]
    if missing:
        raise AssertionError(f"{label} missing phrases: {missing}")


def main() -> int:
    """Run every contract check and write a JSON proof of what was verified.

    Checks run in a fixed order and stop at the first failure: program doc,
    model registry, replay script, fixtures, then skill files. Each check is
    a verbatim substring search; skill files must additionally carry a
    frontmatter header.

    The proof is written to ``--output`` (joined onto ``REPO_ROOT``, so an
    absolute path is used as given) and also printed to stdout.

    Returns:
        ``0`` when every check passes.

    Raises:
        AssertionError: If a required file is missing, a required phrase is
            absent, or a skill file lacks valid frontmatter.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", default=".crabbox-results/llm-refinement-contract.json")
    args = parser.parse_args()

    _require_phrases("program doc", _read(REQUIRED_FILES["program_doc"]), PROGRAM_REQUIRED_PHRASES)
    _require_phrases("model registry", _read(REQUIRED_FILES["model_registry"]), MODEL_REGISTRY_REQUIRED_PHRASES)
    _require_phrases("replay script", _read(REQUIRED_FILES["replay_script"]), REPLAY_REQUIRED_PHRASES)

    fixture_checks = {}
    fixtures_dir = REPO_ROOT / "fixtures" / "decision-engine-eval"
    for filename, phrases in FIXTURE_REQUIRED.items():
        path = fixtures_dir / filename
        text = _read(path)
        relative = path.relative_to(REPO_ROOT)
        _require_phrases(relative, text, phrases)
        fixture_checks[filename] = {
            "path": str(relative),
            "phrases_checked": phrases,
        }

    skill_checks = {}
    for key, phrases in SKILL_REQUIRED.items():
        path = REQUIRED_FILES[key]
        text = _read(path)
        # Frontmatter is validated before the phrases so a malformed skill
        # file is reported as such rather than as a missing-phrase failure.
        _assert_frontmatter(path, text)
        relative = path.relative_to(REPO_ROOT)
        _require_phrases(relative, text, phrases)
        skill_checks[key] = {
            "path": str(relative),
            "phrases_checked": phrases,
        }

    proof = {
        "ok": True,
        "scope": "llm_refinement_decision_engine_contract",
        "program_doc": str(REQUIRED_FILES["program_doc"].relative_to(REPO_ROOT)),
        "model_registry": str(REQUIRED_FILES["model_registry"].relative_to(REPO_ROOT)),
        "program_phrases_checked": PROGRAM_REQUIRED_PHRASES,
        "model_registry_phrases_checked": MODEL_REGISTRY_REQUIRED_PHRASES,
        "fixtures": fixture_checks,
        "skills": skill_checks,
        "pivot": {
            "infra_owner": "Pentagon.run",
            "repo_owner": "decision quality, rubrics, model evals, prompt/tool refinement, DB feedback loops",
        },
    }

    # Serialize once; the file copy differs from stdout only by a final newline.
    rendered = json.dumps(proof, indent=2, sort_keys=True)
    output = REPO_ROOT / args.output
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(rendered + "\n", encoding="utf-8")
    print(rendered)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())