diff --git a/.agents/skills/decision-engine-refinement/SKILL.md b/.agents/skills/decision-engine-refinement/SKILL.md new file mode 100644 index 0000000..a3355d1 --- /dev/null +++ b/.agents/skills/decision-engine-refinement/SKILL.md @@ -0,0 +1,41 @@ +--- +name: decision-engine-refinement +description: Use when improving Living IP decision-engine quality, LLM model selection, evaluator prompts, rubrics, replay evals, Rio or Theseus reviewer behavior, or model bakeoffs. +--- + +# Decision Engine Refinement + +Use this skill for quality work, not infrastructure work. Pentagon.run or Crabbox can run remote jobs; this repo owns model judgment, rubric design, prompt/tool refinement, and proof artifacts. + +## Workflow + +1. Read `docs/llm-refinement-decision-engine.md`. +2. Identify the lane: Rio economics, Theseus model integrity, Leo cross-domain, domain factuality, retrieval quality, or prompt/tool self-upgrade. +3. Build or reuse a replayable fixture before changing prompts or model assignments. +4. Compare baseline vs candidate with the same input, same rubric, and structured verdict format. +5. Record false approves, false rejects, useful disagreements, cost, and latency. +6. Change runtime prompts/models only after the candidate shows a measured improvement with no critical regression. + +## Hard Rules + +- Do not change live model assignments because one answer sounds better. +- Do not use production DB writes to tune prompts. +- Do not collapse Rio and Theseus into generic "reviewers". +- Do not treat payment, popularity, or engagement as quality approval. +- Do not claim production decision-engine improvement without replay evidence and live/staging readback. + +## Agent Responsibilities + +- Rio: incentive design, contribution weights, paid-query effects, market/mechanism reasoning, OPSEC, correlated-prior warnings. +- Theseus: model diversity, adversarial evals, disagreement queues, self-upgrade criteria, prompt/tool safety, verifier drift. 
+- Leo: cross-domain synthesis, fallback review, final arbitration where the route or rubric is ambiguous. + +## Expected Artifacts + +- fixture file or DB query used for sampling; +- baseline verdict output; +- candidate verdict output; +- summary JSON with quality, cost, latency, and disagreement metrics; +- patch scoped to prompts, model config, rubric docs, or eval harness. + +Run `python3 scripts/check_llm_refinement_contract.py` after editing this surface. diff --git a/.agents/skills/nousresearch-hermes-agent/SKILL.md b/.agents/skills/nousresearch-hermes-agent/SKILL.md new file mode 100644 index 0000000..54538e0 --- /dev/null +++ b/.agents/skills/nousresearch-hermes-agent/SKILL.md @@ -0,0 +1,69 @@ +--- +name: nousresearch-hermes-agent +description: Use when packaging Living IP agents, skills, prompts, memory, model routing, or decision-engine workflows for NousResearch Hermes Agent. +--- + +# NousResearch Hermes Agent + +Use this skill to adapt Living IP decision-engine behavior to Hermes Agent. Keep the package fixture-first and no-secret by default. + +## Current External Surface + +As of 2026-06-01, the upstream Hermes Agent README describes: + +- model switching via `hermes model`; +- tools via `hermes tools`; +- a messaging gateway for Telegram, Discord, Slack, WhatsApp, Signal, and CLI; +- built-in skill creation and self-improvement; +- cron scheduling; +- terminal backends including local, Docker, SSH, Modal, and Daytona; +- OpenClaw migration commands. + +Verify upstream docs before depending on a command in code. + +## Living IP Package Shape + +Create a package that includes: + +- agent identity file for Rio or Theseus; +- skill instructions copied from repo-owned `.agents/skills/*`; +- no-secret tool allowlist; +- fixture replay command; +- model selection notes; +- proof output path. + +Do not package production DBs, tokens, API keys, SSH keys, or Bitwarden exports. 
+ +## Rio Package + +Rio Hermes package should focus on: + +- internet finance and mechanism reasoning; +- contribution weights and paid-query effects; +- OPSEC finance filters; +- source-diversity warnings; +- fixture tests for false economic reasoning. + +## Theseus Package + +Theseus Hermes package should focus on: + +- model-diversity evals; +- disagreement queues; +- self-upgrade criteria; +- prompt/tool safety; +- fixture tests for overconfident or poorly grounded model judgments. + +## Handoff Contract + +Every Hermes handoff must include: + +1. install/config snippet; +2. model/provider selection left configurable; +3. tool allowlist; +4. fixture-first demo; +5. no-live-write default; +6. proof artifact path; +7. known blockers. + +Do not claim Hermes production integration until a Hermes runtime actually executes the fixture and writes proof. diff --git a/.agents/skills/openclaw-agent/SKILL.md b/.agents/skills/openclaw-agent/SKILL.md new file mode 100644 index 0000000..6354b99 --- /dev/null +++ b/.agents/skills/openclaw-agent/SKILL.md @@ -0,0 +1,69 @@ +--- +name: openclaw-agent +description: Use when adapting Living IP decision-engine agents, skills, tools, prompt files, or no-secret workflows to OpenClaw agent workspaces. +--- + +# OpenClaw Agent + +Use this skill to package Living IP decision-engine behavior for OpenClaw workspaces. Treat OpenClaw as a distribution/runtime surface, not a new source of truth. + +## Current External Surface + +As of 2026-06-01, the upstream OpenClaw README describes: + +- Node 24 or Node 22.19+ runtime; +- `openclaw onboard --install-daemon`; +- Gateway daemon usage; +- agent prompt files `AGENTS.md`, `SOUL.md`, and `TOOLS.md`; +- workspace skills at `~/.openclaw/workspace/skills/<skill>/SKILL.md`; +- model configuration in OpenClaw config; +- security guidance for DM pairing, allowlists, and sandboxing. + +Verify upstream docs before depending on a command in code. 
+ +## Living IP Workspace Shape + +Create or update: + +- `AGENTS.md`: scope, repo boundaries, proof requirements; +- `SOUL.md`: Rio or Theseus identity; +- `TOOLS.md`: bounded tools only; +- `skills/decision-engine-refinement/SKILL.md`; +- `skills/teleo-db-operator/SKILL.md` only for read-only local copies unless explicitly authorized. + +## Tool Policy + +Default allow: + +- read files; +- run local fixture tests; +- write proof artifacts; +- inspect git diffs; +- query copied SQLite DBs read-only. + +Default deny: + +- production DB writes; +- token reads; +- Bitwarden vault export; +- live GitHub PR comments; +- public messaging sends; +- broad shell automation against host services. + +## Rio And Theseus + +- Rio OpenClaw package: economic reasoning, contribution incentives, paid-query guardrails, OPSEC. +- Theseus OpenClaw package: eval integrity, adversarial prompts, model bakeoffs, self-upgrade review. + +## Proof Contract + +An OpenClaw adapter is useful only if it can run a fixture and produce: + +- prompt files used; +- tool allowlist; +- model selected; +- fixture input; +- structured verdict output; +- proof that no denied tools were invoked. + +Do not claim OpenClaw production readiness until the package runs in an OpenClaw workspace and writes proof. diff --git a/.agents/skills/teleo-db-operator/SKILL.md b/.agents/skills/teleo-db-operator/SKILL.md new file mode 100644 index 0000000..40aa746 --- /dev/null +++ b/.agents/skills/teleo-db-operator/SKILL.md @@ -0,0 +1,76 @@ +--- +name: teleo-db-operator +description: Use when reading, auditing, backing up, querying, or safely writing the Teleo pipeline SQLite database, including review_records, audit_log, costs, prs, sources, and contributor feedback loops. +--- + +# Teleo DB Operator + +Default to read-only. The database is evidence for decision-engine refinement, not a scratchpad. + +## Discover + +1. Read `lib/config.py` for `DB_PATH` and related paths. +2. 
Prefer local or copied DBs over production DBs. +3. If using production, record whether access is read-only or write-authorized. +4. Never print secret values found near DB paths or shell history. + +## Read Path + +Use `sqlite3` or Python `sqlite3`. + +Recommended read targets: + +- `review_records`: evaluator, model, outcome, rejection reason. +- `audit_log`: route decisions, approve/reject events, failure details. +- `costs`: model cost by date/stage. +- `prs`: status, tier, route compatibility fields, verdicts. +- `sources`: priority, feedback, extraction model. + +For refinement work, export aggregated JSON or CSV into `.crabbox-results/` or `proof/`, not raw private DB snapshots. + +## Write Path + +Writes require explicit authorization and a backup. + +Required sequence: + +1. Create a backup or operate on a copy. +2. Write the exact SQL in a retained artifact. +3. Use `BEGIN IMMEDIATE;`. +4. Apply the minimal mutation. +5. Read back the changed rows. +6. Commit the transaction only after readback is correct. +7. Write a blocker artifact instead of guessing if any precondition is missing. + +Never write production prompt/model state as part of an experiment. Experiments should replay fixtures and produce proof first. + +## Safety Boundaries + +- Do not attach, copy, or commit `pipeline.db`. +- Do not run broad `UPDATE` or `DELETE` without a `WHERE` clause and a prior row count. +- Do not mutate `prs`, `sources`, or contributor state from a model response alone. +- Do not treat local copied DB proof as production proof. 
+ +## Useful Queries + +```sql +SELECT reviewer, reviewer_model, outcome, rejection_reason, count(*) AS n +FROM review_records +GROUP BY reviewer, reviewer_model, outcome, rejection_reason +ORDER BY n DESC; +``` + +```sql +SELECT event, count(*) AS n +FROM audit_log +WHERE stage = 'evaluate' +GROUP BY event +ORDER BY n DESC; +``` + +```sql +SELECT model, stage, calls, input_tokens, output_tokens, cost_usd +FROM costs +ORDER BY date DESC, cost_usd DESC +LIMIT 50; +``` diff --git a/.crabbox.yaml b/.crabbox.yaml index ef32963..1fb2d0f 100644 --- a/.crabbox.yaml +++ b/.crabbox.yaml @@ -77,9 +77,12 @@ jobs: python3 -m pip install -e '.[dev]' && mkdir -p .crabbox-results && python3 scripts/check_crabbox_ci_contract.py - --output .crabbox-results/crabbox-ci-contract.json + --output .crabbox-results/crabbox-ci-contract.json && + python3 scripts/check_llm_refinement_contract.py + --output .crabbox-results/llm-refinement-contract.json downloads: - .crabbox-results/crabbox-ci-contract.json + - .crabbox-results/llm-refinement-contract.json stop: always unit: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index df4b354..55b39cf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,6 +43,7 @@ jobs: lib/post_extract.py \ telegram/approvals.py \ scripts/check_crabbox_ci_contract.py \ + scripts/check_llm_refinement_contract.py \ scripts/prove_phase1b_local.py \ tests/test_agent_routing.py \ tests/test_evaluate_agent_routing.py \ @@ -76,8 +77,8 @@ jobs: path: .crabbox-results/pytest.xml if-no-files-found: warn - crabbox-contract: - name: Crabbox and Leo contract + repo-contracts: + name: Repo contracts runs-on: ubuntu-latest timeout-minutes: 10 steps: @@ -93,12 +94,16 @@ jobs: run: | python scripts/check_crabbox_ci_contract.py \ --output .crabbox-results/crabbox-ci-contract.json - - name: Upload contract artifact + python scripts/check_llm_refinement_contract.py \ + --output .crabbox-results/llm-refinement-contract.json + - name: Upload 
contract artifacts if: always() uses: actions/upload-artifact@v4 with: - name: teleo-infrastructure-crabbox-contract - path: .crabbox-results/crabbox-ci-contract.json + name: teleo-infrastructure-repo-contracts + path: | + .crabbox-results/crabbox-ci-contract.json + .crabbox-results/llm-refinement-contract.json if-no-files-found: error phase1b-local-proof: @@ -107,7 +112,7 @@ jobs: needs: - lint - test - - crabbox-contract + - repo-contracts timeout-minutes: 20 env: PHASE1B_AGENT_ROUTING_ENABLED: "true" diff --git a/docs/llm-refinement-decision-engine.md b/docs/llm-refinement-decision-engine.md new file mode 100644 index 0000000..7b1819f --- /dev/null +++ b/docs/llm-refinement-decision-engine.md @@ -0,0 +1,191 @@ +# LLM Refinement And Decision Engine Program + +Created: 2026-06-01 +Status: active direction + +## Product Outcome + +The decision engine should become the best judgment layer for Living IP: it routes knowledge changes to the right agent identities, tests competing LLMs against the same rubric, learns from disagreement, and improves prompts/tools only when measured deltas prove the change. + +Pentagon.run should own disposable infrastructure and remote execution. This repo should own decision quality: rubrics, prompts, model selection, route evidence, database feedback loops, and agent tool packages. + +## What Rio And Theseus Become + +### Rio + +Rio becomes the economic and incentive-quality evaluator. + +Rio owns: + +- contribution weights and role economics; +- paid-query effects and anti-pay-to-pollute rules; +- market, mechanism, futarchy, x402, token, and capital-formation reasoning; +- source-diversity and correlated-prior warnings; +- OPSEC for finance, deal terms, token economics, and internal allocations; +- model tests that expose weak economic reasoning. + +Rio should not be "the crypto agent". Rio should be the agent that asks whether the system's incentives create useful knowledge or garbage incentives. 
+ +### Theseus + +Theseus becomes the model-integrity and agent-refinement evaluator. + +Theseus owns: + +- model diversity and correlated-blind-spot measurement; +- adversarial eval rubrics; +- prompt/tool safety and self-upgrade criteria; +- disagreement queues and verifier-divergence analysis; +- LLM capability evidence and agent-system architecture; +- tests that expose hallucinated certainty, weak causal claims, and prompt-injection fragility. + +Theseus should not be "the AI safety agent". Theseus should be the agent that asks whether the decision system can be trusted when the models are persuasive but wrong. + +## Decision Engine Loop + +```mermaid +flowchart TD + PR["Decision-engine PR or source record"] --> Route["Deterministic route evidence"] + Route --> Reviewers["Required agent reviewers"] + Reviewers --> Rubric["Shared rubric"] + Rubric --> ModelA["Primary model"] + Rubric --> ModelB["Independent model family"] + ModelA --> Verdicts["Structured verdicts"] + ModelB --> Verdicts + Verdicts --> Disagree{"Disagreement?"} + Disagree -->|yes| Queue["Disagreement queue"] + Disagree -->|no| Metrics["Calibration metrics"] + Queue --> HumanOrLeo["Leo or human arbitration"] + HumanOrLeo --> Metrics + Metrics --> DB["SQLite feedback state"] + DB --> Refine["Prompt, tool, or model proposal"] + Refine --> Delta["Before/after eval harness"] + Delta -->|passes| Update["Commit refinement"] + Delta -->|fails| Archive["Archive failed refinement"] +``` + +## Model Portfolio + +The goal is not to pick one favorite model. The goal is to assign models to failure modes. 
+ +| Lane | Primary evaluator | Independent check | Why | +| --- | --- | --- | --- | +| Fast triage | cheap small model | deterministic route evidence | triage should be cheap and overridable | +| Domain review | routed agent prompt | different model family | catch domain-specific errors without same-family agreement bias | +| Deep review | strongest available reasoning model | non-Claude or non-primary family | deep review is for structural claims and disagreement | +| Economic reasoning | Rio rubric | model with strong quantitative/mechanism reasoning | tests incentive design, paid-query effects, and contribution weights | +| Agent/refinement safety | Theseus rubric | model with strong adversarial critique | tests tool safety, self-upgrades, and evaluator drift | + +Candidate models should enter only through a harness: + +1. fixed input set; +2. fixed rubric; +3. structured verdict JSON; +4. cost and latency recorded; +5. disagreement categories stored; +6. before/after comparison against current baseline. + +No model switch is accepted because it "sounds better" on one example. + +## Refinement Workstreams + +### R1: Rubric Packets + +Create a small rubric packet for each evaluator role: + +- `rio-economics-rubric` +- `theseus-model-integrity-rubric` +- `leo-cross-domain-rubric` +- domain-specific factuality rubrics + +Each packet must define allowed verdicts, rejection tags, must-check criteria, and examples of false positives. + +### R2: Evaluation Corpus + +Build a replayable corpus from existing PRs: + +- approved clean PRs; +- rejected PRs by issue tag; +- Rio/Theseus cross-domain PRs; +- paid-query or contribution-weight examples; +- adversarial malformed claims; +- near-duplicate and OPSEC edge cases. + +Use local fixture data first. Production DB sampling requires the DB operator skill. 
+ +### R3: Model Bakeoff + +Run each candidate model against the same corpus and emit: + +- accuracy against expected disposition; +- false-approve count; +- false-reject count; +- issue-tag precision; +- average latency; +- estimated cost; +- disagreement matrix by model pair. + +The highest-signal metric is not raw approval rate. It is false approvals on bad claims plus useful disagreement on ambiguous claims. + +### R4: Feedback Loop + +Use `review_records`, `audit_log`, `costs`, and PR state to find: + +- recurring model failure categories; +- agents with repeated same-tag rejections; +- prompts that produce vague reviews; +- cost spikes without quality gain; +- routes that keep requiring manual override. + +Every prompt/tool change should include a before/after proof over this loop. + +### R5: Agent Runtime Packages + +Package the same decision-engine contract for: + +- NousResearch Hermes Agent: skill/memory/model-switching oriented. +- OpenClaw: workspace skill plus `AGENTS.md`, `SOUL.md`, `TOOLS.md` oriented. + +Both packages should be fixture-first and no-secret by default. They are distribution surfaces for the decision engine, not separate evaluators with their own truth. + +## DB Usage Boundary + +Default is read-only. + +Writes are allowed only when all are true: + +- the target DB is local, staging, or explicitly authorized production; +- a backup or copy exists; +- the write is wrapped in a transaction; +- the exact query is retained in a proof artifact; +- the post-write readback is retained. + +Never let an agent tune prompts by mutating production state directly. + +## Pentagon.run Boundary + +Pentagon.run should own: + +- disposable VPS setup; +- Crabbox or remote proof execution; +- Hetzner lifecycle; +- runner cleanup; +- infra receipts. + +This repo should own: + +- decision-engine quality; +- model and prompt experiments; +- agent skills and adapter handoffs; +- database feedback analysis; +- proof schemas for eval quality. 
# Repository root. This script lives at scripts/check_llm_refinement_contract.py,
# so the root is one directory above the script's own directory.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Guidance files that make up the contract, keyed by a short identifier that
# SKILL_REQUIRED and main() use to look them up.
REQUIRED_FILES = {
    "program_doc": REPO_ROOT / "docs" / "llm-refinement-decision-engine.md",
    "decision_skill": REPO_ROOT / ".agents" / "skills" / "decision-engine-refinement" / "SKILL.md",
    "db_skill": REPO_ROOT / ".agents" / "skills" / "teleo-db-operator" / "SKILL.md",
    "hermes_skill": REPO_ROOT / ".agents" / "skills" / "nousresearch-hermes-agent" / "SKILL.md",
    "openclaw_skill": REPO_ROOT / ".agents" / "skills" / "openclaw-agent" / "SKILL.md",
}

# Exact substrings that must appear in the program doc.
PROGRAM_REQUIRED_PHRASES = [
    "Pentagon.run should own disposable infrastructure",
    "This repo should own decision quality",
    "Rio becomes the economic and incentive-quality evaluator",
    "Theseus becomes the model-integrity and agent-refinement evaluator",
    "No model switch is accepted because it",
    "Default is read-only",
]

# Exact substrings that must appear in each skill file (keys match REQUIRED_FILES).
SKILL_REQUIRED = {
    "decision_skill": [
        "Rio economics",
        "Theseus model integrity",
        "Do not change live model assignments",
        "baseline verdict output",
    ],
    "db_skill": [
        "Default to read-only",
        "BEGIN IMMEDIATE",
        "Do not attach, copy, or commit `pipeline.db`",
        "review_records",
    ],
    "hermes_skill": [
        "model switching",
        "fixture-first",
        "Rio Hermes package",
        "Theseus Hermes package",
    ],
    "openclaw_skill": [
        "AGENTS.md",
        "SOUL.md",
        "TOOLS.md",
        "Default deny",
    ],
}

# Leading YAML frontmatter: an opening `---` line, the frontmatter text, and a
# closing `---` line.  The named group "body" captures the text between the
# fences; DOTALL lets `.` span newlines and the non-greedy quantifier stops at
# the first closing fence.
_FRONTMATTER_RE = re.compile(r"^---\n(?P<body>.*?)\n---\n", flags=re.DOTALL)


def _read(path: Path) -> str:
    """Return the text of *path*, decoded as UTF-8.

    Raises:
        AssertionError: if *path* does not exist.  The message reports the
            path relative to ``REPO_ROOT``.
    """
    if not path.exists():
        raise AssertionError(f"missing file: {path.relative_to(REPO_ROOT)}")
    # Explicit encoding so the check does not depend on the runner's locale.
    return path.read_text(encoding="utf-8")


def _assert_frontmatter(path: Path, text: str) -> None:
    """Check that *text* starts with frontmatter naming ``name`` and ``description``.

    Args:
        path: File the text was read from; used only to build error messages
            (reported relative to ``REPO_ROOT``).
        text: Full file contents.

    Raises:
        AssertionError: if *text* does not begin with a ``---`` fenced block,
            or the block contains neither ``name:`` nor ``description:``.
    """
    match = _FRONTMATTER_RE.match(text)
    if not match:
        raise AssertionError(f"{path.relative_to(REPO_ROOT)} missing YAML frontmatter")
    body = match.group("body")
    # Plain substring checks: the frontmatter is not parsed as YAML here.
    if "name:" not in body or "description:" not in body:
        raise AssertionError(f"{path.relative_to(REPO_ROOT)} frontmatter needs name and description")


def main() -> int:
    """Validate the guidance files and write a JSON proof artifact.

    Reads ``--output`` from the command line (a path resolved against
    ``REPO_ROOT``), checks the program doc for ``PROGRAM_REQUIRED_PHRASES``,
    checks every skill in ``SKILL_REQUIRED`` for frontmatter and required
    phrases, then writes the proof JSON to the output path and prints it.

    Returns:
        0 when every check passes.

    Raises:
        AssertionError: on the first missing file, missing frontmatter, or
            missing phrase.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", default=".crabbox-results/llm-refinement-contract.json")
    args = parser.parse_args()

    program = _read(REQUIRED_FILES["program_doc"])
    missing_program = [phrase for phrase in PROGRAM_REQUIRED_PHRASES if phrase not in program]
    if missing_program:
        raise AssertionError(f"program doc missing phrases: {missing_program}")

    skill_checks = {}
    for key, phrases in SKILL_REQUIRED.items():
        path = REQUIRED_FILES[key]
        text = _read(path)
        _assert_frontmatter(path, text)
        missing = [phrase for phrase in phrases if phrase not in text]
        if missing:
            raise AssertionError(f"{path.relative_to(REPO_ROOT)} missing phrases: {missing}")
        skill_checks[key] = {
            "path": str(path.relative_to(REPO_ROOT)),
            "phrases_checked": phrases,
        }

    proof = {
        "ok": True,
        "scope": "llm_refinement_decision_engine_contract",
        "program_doc": str(REQUIRED_FILES["program_doc"].relative_to(REPO_ROOT)),
        "program_phrases_checked": PROGRAM_REQUIRED_PHRASES,
        "skills": skill_checks,
        "pivot": {
            "infra_owner": "Pentagon.run",
            "repo_owner": "decision quality, rubrics, model evals, prompt/tool refinement, DB feedback loops",
        },
    }

    # Serialize once so the file on disk and the printed proof are identical.
    rendered = json.dumps(proof, indent=2, sort_keys=True)
    output = REPO_ROOT / args.output
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(rendered + "\n", encoding="utf-8")
    print(rendered)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())