Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source of truth. Previously only 8 of 67 files existed in repo — the rest were deployed directly to VPS via SCP, causing massive drift. Includes: - pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.) - pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh - diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics) - agent-state/: bootstrap, lib-state, cascade inbox processor, schema - systemd/: service unit files for reference - deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate - research-session.sh: updated with Step 8.5 digest + cascade inbox processing No new code written — all files are exact copies from VPS as of 2026-04-06. From this point forward: edit in repo, commit, then deploy.sh. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
219 lines · 9.5 KiB · Python
"""Pipeline v2 configuration — all constants and thresholds."""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# --- Paths ---
BASE_DIR = Path(os.getenv("PIPELINE_BASE", "/opt/teleo-eval"))

_WORKSPACES = BASE_DIR / "workspaces"
REPO_DIR = _WORKSPACES / "teleo-codex.git"
MAIN_WORKTREE = _WORKSPACES / "main"
SECRETS_DIR = BASE_DIR / "secrets"
LOG_DIR = BASE_DIR / "logs"
DB_PATH = BASE_DIR / "pipeline" / "pipeline.db"

# Single file-based lock shared by every process that writes to the main
# worktree (pipeline daemon stages and the telegram bot).
# Ganymede: one lock, one mechanism.
MAIN_WORKTREE_LOCKFILE = _WORKSPACES / ".main-worktree.lock"

# Inbox locations (relative paths, not anchored under BASE_DIR).
INBOX_QUEUE = "inbox/queue"
INBOX_ARCHIVE = "inbox/archive"
INBOX_NULL_RESULT = "inbox/null-result"

# --- Forgejo ---
FORGEJO_URL = os.getenv("FORGEJO_URL", "http://localhost:3000")
FORGEJO_OWNER = "teleo"
FORGEJO_REPO = "teleo-codex"
FORGEJO_TOKEN_FILE = SECRETS_DIR / "forgejo-admin-token"
FORGEJO_PIPELINE_USER = "teleo"  # git identity used for pipeline commits
# --- Models ---
CLAUDE_CLI = os.getenv("CLAUDE_CLI", "/home/teleo/.local/bin/claude")
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

# Model identifiers. Bare aliases ("opus", "sonnet") run on Claude Max via the
# CLI; slash-qualified IDs are OpenRouter models.
MODEL_OPUS = "opus"
MODEL_SONNET = "sonnet"
MODEL_HAIKU = "anthropic/claude-3.5-haiku"
MODEL_GPT4O = "openai/gpt-4o"  # legacy, kept for reference
MODEL_GEMINI_FLASH = "google/gemini-2.5-flash"  # was -preview, removed by OpenRouter
MODEL_SONNET_OR = "anthropic/claude-sonnet-4.5"  # OpenRouter Sonnet (paid, not Claude Max)

# --- Model assignment per stage ---
# Principle: Opus is scarce (Claude Max) — reserve it for DEEP eval and
# overnight research. Model diversity: domain review (Gemini Flash) and Leo
# STANDARD (Sonnet) are two model families, so no correlated blindspots; both
# run on OpenRouter, keeping the Claude Max rate limit untouched for Opus.
#
# Pipeline eval ordering (domain-first, Leo-last):
#   1. Domain review -> Gemini 2.5 Flash (OpenRouter) — different family from Leo
#   2. Leo STANDARD  -> Sonnet (OpenRouter)           — different family from domain
#   3. Leo DEEP      -> Opus (Claude Max)             — highest judgment, scarce
EXTRACT_MODEL = MODEL_SONNET               # extraction: structured output, volume work (Claude Max)
TRIAGE_MODEL = MODEL_HAIKU                 # triage: routing decision, cheapest (OpenRouter)
EVAL_DOMAIN_MODEL = MODEL_GEMINI_FLASH     # domain review (was GPT-4o — 16x cheaper, different family from Sonnet)
EVAL_LEO_MODEL = MODEL_OPUS                # Leo DEEP review: Claude Max Opus
EVAL_LEO_STANDARD_MODEL = MODEL_SONNET_OR  # Leo STANDARD review: OpenRouter Sonnet
EVAL_DEEP_MODEL = MODEL_GEMINI_FLASH       # DEEP cross-family: paid, adversarial
# --- Model backends ---
# Each model call can run on Claude Max (subscription, base load) or on the
# API (overflow / spikes). Claude Max is free but rate-limited; the API is
# paid but unlimited. When Claude Max is rate-limited, each stage follows one
# of three policies:
#   "queue"    — wait for capacity (preferred for non-urgent work)
#   "overflow" — fall back to the API (for time-sensitive work)
#   "skip"     — skip this cycle (for optional stages like sample audit)
OVERFLOW_POLICY = dict(
    extract="queue",         # extraction can wait
    triage="overflow",       # triage is cheap on API anyway
    eval_domain="overflow",  # domain review is the volume filter — don't let it bottleneck (Rhea)
    eval_leo="queue",        # Leo review is the bottleneck we protect
    eval_deep="overflow",    # DEEP is already on API
    sample_audit="skip",     # optional, skip if constrained
)
# OpenRouter cost rates in USD per 1K tokens (only applies when using the API,
# not Claude Max). Keys are model IDs.
# Fix: the "opus"/"sonnet" entries previously used string literals while the
# other four used MODEL_* constants — all keys now go through the constants so
# a model-ID change can never silently desynchronize cost lookup from model
# selection. (MODEL_OPUS == "opus" and MODEL_SONNET == "sonnet", so the dict
# contents are unchanged.)
MODEL_COSTS = {
    MODEL_OPUS: {"input": 0.015, "output": 0.075},
    MODEL_SONNET: {"input": 0.003, "output": 0.015},
    MODEL_HAIKU: {"input": 0.0008, "output": 0.004},
    MODEL_GPT4O: {"input": 0.0025, "output": 0.01},
    MODEL_GEMINI_FLASH: {"input": 0.00015, "output": 0.0006},
    MODEL_SONNET_OR: {"input": 0.003, "output": 0.015},
}
# --- Concurrency ---
MAX_EXTRACT_WORKERS = int(os.getenv("MAX_EXTRACT_WORKERS", "5"))
MAX_EVAL_WORKERS = int(os.getenv("MAX_EVAL_WORKERS", "7"))
# Merges are domain-serialized, but also only one merge at a time per domain.
MAX_MERGE_WORKERS = 1

# --- Timeouts (seconds) ---
EXTRACT_TIMEOUT = 10 * 60       # 10 min
EVAL_TIMEOUT = 2 * 60           # routine Sonnet/Gemini Flash calls (was 600 — caused 10-min stalls)
EVAL_TIMEOUT_OPUS = 10 * 60     # Opus DEEP eval needs more time for complex reasoning
MERGE_TIMEOUT = 5 * 60          # force-reset to conflict if exceeded (Rhea)
CLAUDE_MAX_PROBE_TIMEOUT = 15
# --- Backpressure ---
# Queue-depth thresholds that gate extraction throughput.
BACKPRESSURE_HIGH = 40             # pause extraction above this
BACKPRESSURE_LOW = 20              # throttle extraction above this
BACKPRESSURE_THROTTLE_WORKERS = 2  # worker count while throttled

# --- Retry budgets ---
TRANSIENT_RETRY_MAX = 5         # API timeouts, rate limits
SUBSTANTIVE_RETRY_STANDARD = 2  # reviewer request_changes
SUBSTANTIVE_RETRY_DEEP = 3
MAX_EVAL_ATTEMPTS = 3   # hard cap on eval cycles per PR before terminal
MAX_FIX_ATTEMPTS = 2    # hard cap on auto-fix cycles per PR before giving up
MAX_FIX_PER_CYCLE = 15  # PRs to fix per cycle — bumped from 5 to clear backlog (Cory, Mar 14)
# Issue tags a mechanical pass (Python fixer or Haiku) can resolve.
# broken_wiki_links was removed: downgraded to a warning, not a gate. Links to
# claims in other open PRs resolve naturally as the dependency chain merges.
# (Cory, Mar 14)
MECHANICAL_ISSUE_TAGS = {"frontmatter_schema", "near_duplicate"}

# Issue tags that force re-extraction (substantive quality problems).
SUBSTANTIVE_ISSUE_TAGS = {
    "factual_discrepancy",
    "confidence_miscalibration",
    "scope_error",
    "title_overclaims",
}
# --- Content type schemas ---
# Registry of content types. validate.py branches on type to pick the right
# required-field list, confidence rules, and title checks. Adding a new type
# is a dict entry here — no code change in validate.py needed.
#
# Fields common to every content type; per-type extras are appended below.
_BASE_REQUIRED = ("type", "domain", "description")

TYPE_SCHEMAS = {
    "claim": {
        "required": _BASE_REQUIRED + ("confidence", "source", "created"),
        "valid_confidence": ("proven", "likely", "experimental", "speculative"),
        "needs_proposition_title": True,
    },
    "framework": {
        "required": _BASE_REQUIRED + ("source", "created"),
        "valid_confidence": None,
        "needs_proposition_title": True,
    },
    "entity": {
        "required": _BASE_REQUIRED,
        "valid_confidence": None,
        "needs_proposition_title": False,
    },
    "decision": {
        "required": _BASE_REQUIRED + ("parent_entity", "status"),
        "valid_confidence": None,
        "needs_proposition_title": False,
        "valid_status": ("active", "passed", "failed", "expired", "cancelled"),
    },
}

# --- Content directories ---
# Centralized path templates (Rhea: don't hardcode these across 5 files).
ENTITY_DIR_TEMPLATE = "entities/{domain}"
DECISION_DIR_TEMPLATE = "decisions/{domain}"
# --- Contributor tiers ---
# Auto-promotion rules. CI is computed on demand, never stored.
CONTRIBUTOR_TIER_RULES = {
    "contributor": {"claims_merged": 1},
    "veteran": {
        "claims_merged": 10,
        "min_days_since_first": 30,
        "challenges_survived": 1,
    },
}

# Role weights for CI computation — must stay in sync with
# schemas/contribution-weights.yaml.
CONTRIBUTION_ROLE_WEIGHTS = {
    "sourcer": 0.15,
    "extractor": 0.40,
    "challenger": 0.20,
    "synthesizer": 0.15,
    "reviewer": 0.10,
}

# --- Circuit breakers ---
BREAKER_THRESHOLD = 5
BREAKER_COOLDOWN = 15 * 60  # 15 min
# --- Cost budgets ---
OPENROUTER_DAILY_BUDGET = 20.0   # USD
OPENROUTER_WARN_THRESHOLD = 0.8  # warn at 80% of the daily budget

# --- Quality ---
# Pre-merge sample audit (Rio): a fraction of LIGHT merges is promoted to
# STANDARD review before merging; disagreement above the threshold means the
# LIGHT criteria should be tightened.
SAMPLE_AUDIT_RATE = 0.15
SAMPLE_AUDIT_DISAGREEMENT_THRESHOLD = 0.10
SAMPLE_AUDIT_MODEL = MODEL_OPUS  # Opus for audit — different family from Haiku triage (Leo)
# --- Batch eval ---
# Batch domain review groups STANDARD PRs by domain — one LLM call per batch.
# Leo review stays individual (safety net for cross-contamination).
BATCH_EVAL_MAX_PRS = int(os.getenv("BATCH_EVAL_MAX_PRS", "5"))
BATCH_EVAL_MAX_DIFF_BYTES = int(os.getenv("BATCH_EVAL_MAX_DIFF_BYTES", "100000"))  # 100KB

# --- Tier logic ---
# When True, LIGHT PRs skip domain+Leo review entirely (auto-approve on a
# Tier 0 pass). Keep False for shadow mode (domain review runs but only logs);
# flip True after 24h validation (Rhea).
LIGHT_SKIP_LLM = os.getenv("LIGHT_SKIP_LLM", "false").lower() == "true"

# Random pre-merge promotion (Rio): this fraction of LIGHT PRs is upgraded to
# STANDARD before eval. Makes gaming unpredictable — extraction agents can't
# know which LIGHT PRs get full review.
LIGHT_PROMOTION_RATE = float(os.getenv("LIGHT_PROMOTION_RATE", "0.15"))
# --- Polling intervals (seconds) ---
# How often each daemon stage wakes to check for new work.
INGEST_INTERVAL = 60
VALIDATE_INTERVAL = 30
EVAL_INTERVAL = 30
MERGE_INTERVAL = 30
FIX_INTERVAL = 60
HEALTH_CHECK_INTERVAL = 60

# --- Retrieval (Telegram bot) ---
RETRIEVAL_RRF_K = 20             # RRF smoothing constant — tuned for 5-10 results per source
RETRIEVAL_ENTITY_BOOST = 1.5     # RRF score multiplier for claims wiki-linked from matched entities
RETRIEVAL_MAX_RESULTS = 10       # max claims shown to the LLM after RRF merge
RETRIEVAL_MIN_CLAIM_SCORE = 3.0  # keyword-score floor — filters single-stopword matches
# --- Health API ---
HEALTH_PORT = 8080

# --- Logging ---
LOG_FILE = LOG_DIR / "pipeline.jsonl"
LOG_ROTATION_MAX_BYTES = 50 * 1024 * 1024  # 50MB per file
LOG_ROTATION_BACKUP_COUNT = 7              # keep 7 days

# --- Versioning ---
# Tracked in metrics_snapshots so charts can be annotated with the version
# that produced each data point.
PROMPT_VERSION = "v2-lean-directed"  # bump on every prompt change
PIPELINE_VERSION = "2.2"             # bump on every significant pipeline change