teleo-infrastructure/lib/config.py
m3taversal e7c902bac8 leo: implement retry budget — stop infinite eval loops
Schema migration v3: adds eval_attempts (INTEGER) and eval_issues (TEXT/JSON)
columns to prs table.

Retry budget logic (Ganymede-approved design):
- Increment eval_attempts on each evaluate_pr() call
- Hard cap: eval_attempts >= 3 → terminal (close PR, tag source needs_human)
- Attempt 1: normal — back to open, wait for fix
- Attempt 2: classify issues as mechanical/substantive
  - Mechanical only (schema, wiki links, dedup): keep open for one more try
  - Substantive (factual, confidence, scope, title): close PR, requeue source
- Issue tags parsed from reviewer comments, stored in eval_issues column
- SHA-based reset: new commits on PR branch → eval_attempts=0, verdicts reset
- Post-migration stagger: LIMIT 5 for first batch to avoid OpenRouter spike
- Cost recording updated: domain review → OpenRouter, Leo → tier-dependent

Stops the 32-PR infinite loop burning ~$0.03/cycle with no terminal state.

Pentagon-Agent: Leo <294C3CA1-0205-4668-82FA-B984D54F48AD>
2026-03-13 17:14:12 +00:00

126 lines
5.1 KiB
Python

"""Pipeline v2 configuration — all constants and thresholds."""
import os
from pathlib import Path
# --- Paths ---
# Every filesystem location derives from one base directory so the whole tree
# can be relocated by setting PIPELINE_BASE (default /opt/teleo-eval).
BASE_DIR = Path(os.environ.get("PIPELINE_BASE", "/opt/teleo-eval"))
REPO_DIR = BASE_DIR / "workspaces" / "teleo-codex.git"  # presumably a bare repo (".git" suffix) — confirm
MAIN_WORKTREE = BASE_DIR / "workspaces" / "main"  # checked-out worktree of the main branch
SECRETS_DIR = BASE_DIR / "secrets"  # credential files (e.g. FORGEJO_TOKEN_FILE below)
LOG_DIR = BASE_DIR / "logs"  # directory for LOG_FILE below
DB_PATH = BASE_DIR / "pipeline" / "pipeline.db"  # pipeline state database
# Repo-relative path (plain str, deliberately NOT anchored under BASE_DIR).
INBOX_ARCHIVE = "inbox/archive"
# --- Forgejo ---
# Connection details for the Forgejo instance that hosts the target repo.
FORGEJO_URL = os.environ.get("FORGEJO_URL", "http://localhost:3000")
FORGEJO_OWNER = "teleo"  # repository owner (org/user) on Forgejo
FORGEJO_REPO = "teleo-codex"  # repository name under FORGEJO_OWNER
FORGEJO_TOKEN_FILE = SECRETS_DIR / "forgejo-admin-token"  # file holding the admin API token
FORGEJO_PIPELINE_USER = "teleo" # git user for pipeline commits
# --- Models ---
CLAUDE_CLI = os.environ.get("CLAUDE_CLI", "/home/teleo/.local/bin/claude")  # Claude CLI binary (Claude Max backend)
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
# Model IDs
# Bare aliases ("opus", "sonnet") appear to be Claude CLI model names, while
# slash-qualified IDs are OpenRouter identifiers — confirm against the callers.
MODEL_OPUS = "opus"
MODEL_SONNET = "sonnet"
MODEL_HAIKU = "anthropic/claude-3.5-haiku"
MODEL_GPT4O = "openai/gpt-4o"
MODEL_SONNET_OR = "anthropic/claude-sonnet-4.5" # OpenRouter Sonnet (paid, not Claude Max)
# --- Model assignment per stage ---
# Principle: Opus is scarce (Claude Max). Reserve for DEEP eval + overnight research.
# Model diversity: domain (GPT-4o) + Leo (Sonnet) = two model families, no correlated blindspots.
# Both on OpenRouter = Claude Max rate limit untouched for Opus.
#
# Pipeline eval ordering (domain-first, Leo-last):
# 1. Domain review → GPT-4o (OpenRouter) — different family from Leo
# 2. Leo STANDARD → Sonnet (OpenRouter) — different family from domain
# 3. Leo DEEP → Opus (Claude Max) — highest judgment, scarce
EXTRACT_MODEL = MODEL_SONNET # extraction: structured output, volume work (Claude Max)
TRIAGE_MODEL = MODEL_HAIKU # triage: routing decision, cheapest (OpenRouter)
EVAL_DOMAIN_MODEL = MODEL_GPT4O # domain review: OpenRouter GPT-4o
EVAL_LEO_MODEL = MODEL_OPUS # Leo DEEP review: Claude Max Opus
EVAL_LEO_STANDARD_MODEL = MODEL_SONNET_OR # Leo STANDARD review: OpenRouter Sonnet
# NOTE(review): both EVAL_LEO_MODEL and EVAL_DEEP_MODEL are described as "DEEP"
# work — the names are easy to confuse; verify which consumers read which.
EVAL_DEEP_MODEL = MODEL_GPT4O # DEEP cross-family: paid, adversarial
# --- Model backends ---
# Every model has two possible backends: the Claude Max subscription (no
# marginal cost, but rate-limited) and the metered API (paid, effectively
# unlimited). This table decides what each stage does when Claude Max has no
# spare capacity:
#   "queue"    — block until capacity frees up (preferred for non-urgent work)
#   "overflow" — spill over to the paid API (time-sensitive work)
#   "skip"     — drop this cycle entirely (optional stages only)
OVERFLOW_POLICY = {
    "extract": "queue",         # extraction tolerates waiting
    "triage": "overflow",       # already cheap on the API
    "eval_domain": "overflow",  # the volume filter — must never bottleneck (Rhea)
    "eval_leo": "queue",        # the scarce resource this policy protects
    "eval_deep": "overflow",    # runs on the API regardless
    "sample_audit": "skip",     # optional; dropped under pressure
}
# OpenRouter cost rates per 1K tokens (only applies when using API, not Claude Max)
# Keys are the MODEL_* constants defined above — previously "opus" and "sonnet"
# were hard-coded string literals, so renaming MODEL_OPUS/MODEL_SONNET would
# have silently orphaned their cost entries while the other three stayed keyed
# by constant. Values: USD per 1K input/output tokens.
MODEL_COSTS = {
    MODEL_OPUS: {"input": 0.015, "output": 0.075},
    MODEL_SONNET: {"input": 0.003, "output": 0.015},
    MODEL_HAIKU: {"input": 0.0008, "output": 0.004},
    MODEL_GPT4O: {"input": 0.0025, "output": 0.01},
    MODEL_SONNET_OR: {"input": 0.003, "output": 0.015},
}
# --- Concurrency ---
# Worker counts are env-overridable; defaults tuned for the eval host.
MAX_EXTRACT_WORKERS = int(os.environ.get("MAX_EXTRACT_WORKERS", "5"))  # parallel extraction jobs
MAX_EVAL_WORKERS = int(os.environ.get("MAX_EVAL_WORKERS", "7"))  # parallel eval jobs
MAX_MERGE_WORKERS = 1 # domain-serialized, but one merge at a time per domain
# --- Timeouts (seconds) ---
EXTRACT_TIMEOUT = 600 # 10 min
EVAL_TIMEOUT = 300 # 5 min
MERGE_TIMEOUT = 300 # 5 min — force-reset to conflict if exceeded (Rhea)
CLAUDE_MAX_PROBE_TIMEOUT = 15  # presumably the Claude Max capacity probe — confirm in consumer
# --- Backpressure ---
# Two-level extraction throttle (units presumably downstream queue depth —
# confirm against the consumer): between LOW and HIGH the extraction pool
# shrinks to BACKPRESSURE_THROTTLE_WORKERS; above HIGH extraction pauses.
BACKPRESSURE_HIGH = 40 # pause extraction above this
BACKPRESSURE_LOW = 20 # throttle extraction above this
BACKPRESSURE_THROTTLE_WORKERS = 2 # workers when throttled
# --- Retry budgets ---
TRANSIENT_RETRY_MAX = 5 # API timeouts, rate limits
SUBSTANTIVE_RETRY_STANDARD = 2 # reviewer request_changes
SUBSTANTIVE_RETRY_DEEP = 3  # extra retry headroom for DEEP-tier reviews
MAX_EVAL_ATTEMPTS = 3 # Hard cap on eval cycles per PR before terminal
# Issue tags that can be fixed mechanically (Python fixer or Haiku)
MECHANICAL_ISSUE_TAGS = {"frontmatter_schema", "broken_wiki_links", "near_duplicate"}
# Issue tags that require re-extraction (substantive quality problems)
SUBSTANTIVE_ISSUE_TAGS = {"factual_discrepancy", "confidence_miscalibration", "scope_error", "title_overclaims"}
# --- Circuit breakers ---
BREAKER_THRESHOLD = 5  # failures before a breaker trips (presumably consecutive — confirm in consumer)
BREAKER_COOLDOWN = 900 # 15 min
# --- Cost budgets ---
OPENROUTER_DAILY_BUDGET = 20.0 # USD
OPENROUTER_WARN_THRESHOLD = 0.8 # 80% of budget
# --- Quality ---
SAMPLE_AUDIT_RATE = 0.10 # 10% of LIGHT merges
SAMPLE_AUDIT_DISAGREEMENT_THRESHOLD = 0.10 # 10% disagreement → tighten LIGHT criteria
# --- Polling intervals (seconds) ---
# How often each stage polls for new work.
INGEST_INTERVAL = 60
VALIDATE_INTERVAL = 30
EVAL_INTERVAL = 30
MERGE_INTERVAL = 30
HEALTH_CHECK_INTERVAL = 60
# --- Health API ---
HEALTH_PORT = 8080  # local HTTP port for the health endpoint
# --- Logging ---
LOG_FILE = LOG_DIR / "pipeline.jsonl"  # structured JSON-lines log
LOG_ROTATION_MAX_BYTES = 50 * 1024 * 1024 # 50MB per file
LOG_ROTATION_BACKUP_COUNT = 7 # keep 7 days