"""Pipeline v2 configuration — all constants and thresholds.""" import os from pathlib import Path # --- Paths --- BASE_DIR = Path(os.environ.get("PIPELINE_BASE", "/opt/teleo-eval")) REPO_DIR = BASE_DIR / "workspaces" / "teleo-codex.git" MAIN_WORKTREE = BASE_DIR / "workspaces" / "main" SECRETS_DIR = BASE_DIR / "secrets" LOG_DIR = BASE_DIR / "logs" DB_PATH = BASE_DIR / "pipeline" / "pipeline.db" # File-based worktree lock path — used by all processes that write to main worktree # (pipeline daemon stages + telegram bot). Ganymede: one lock, one mechanism. MAIN_WORKTREE_LOCKFILE = BASE_DIR / "workspaces" / ".main-worktree.lock" INBOX_QUEUE = "inbox/queue" INBOX_ARCHIVE = "inbox/archive" INBOX_NULL_RESULT = "inbox/null-result" # --- Forgejo --- FORGEJO_URL = os.environ.get("FORGEJO_URL", "http://localhost:3000") FORGEJO_OWNER = "teleo" FORGEJO_REPO = "teleo-codex" FORGEJO_TOKEN_FILE = SECRETS_DIR / "forgejo-admin-token" FORGEJO_PIPELINE_USER = "teleo" # git user for pipeline commits # --- Models --- CLAUDE_CLI = os.environ.get("CLAUDE_CLI", "/home/teleo/.local/bin/claude") OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions" # Model IDs MODEL_OPUS = "opus" MODEL_SONNET = "sonnet" MODEL_HAIKU = "anthropic/claude-3.5-haiku" MODEL_GPT4O = "openai/gpt-4o" # legacy, kept for reference MODEL_GEMINI_FLASH = "google/gemini-2.5-flash" # was -preview, removed by OpenRouter MODEL_SONNET_OR = "anthropic/claude-sonnet-4.5" # OpenRouter Sonnet (paid, not Claude Max) # --- Model assignment per stage --- # Principle: Opus is scarce (Claude Max). Reserve for DEEP eval + overnight research. # Model diversity: domain (GPT-4o) + Leo (Sonnet) = two model families, no correlated blindspots. # Both on OpenRouter = Claude Max rate limit untouched for Opus. # # Pipeline eval ordering (domain-first, Leo-last): # 1. Domain review → GPT-4o (OpenRouter) — different family from Leo # 2. Leo STANDARD → Sonnet (OpenRouter) — different family from domain # 3. 
Leo DEEP → Opus (Claude Max) — highest judgment, scarce EXTRACT_MODEL = MODEL_SONNET # extraction: structured output, volume work (Claude Max) TRIAGE_MODEL = MODEL_HAIKU # triage: routing decision, cheapest (OpenRouter) EVAL_DOMAIN_MODEL = MODEL_GEMINI_FLASH # domain review: Gemini 2.5 Flash (was GPT-4o — 16x cheaper, different family from Sonnet) EVAL_LEO_MODEL = MODEL_OPUS # Leo DEEP review: Claude Max Opus EVAL_LEO_STANDARD_MODEL = MODEL_SONNET_OR # Leo STANDARD review: OpenRouter Sonnet EVAL_DEEP_MODEL = MODEL_GEMINI_FLASH # DEEP cross-family: paid, adversarial # --- Model backends --- # Each model can run on Claude Max (subscription, base load) or API (overflow/spikes). # Claude Max: free but rate-limited. API: paid but unlimited. # When Claude Max is rate-limited, behavior per stage: # "queue" — wait for capacity (preferred for non-urgent work) # "overflow" — fall back to API (for time-sensitive work) # "skip" — skip this cycle (for optional stages like sample audit) OVERFLOW_POLICY = { "extract": "queue", # extraction can wait "triage": "overflow", # triage is cheap on API anyway "eval_domain": "overflow", # domain review is the volume filter — don't let it bottleneck (Rhea) "eval_leo": "queue", # Leo review is the bottleneck we protect "eval_deep": "overflow", # DEEP is already on API "sample_audit": "skip", # optional, skip if constrained } # OpenRouter cost rates per 1K tokens (only applies when using API, not Claude Max) MODEL_COSTS = { "opus": {"input": 0.015, "output": 0.075}, "sonnet": {"input": 0.003, "output": 0.015}, MODEL_HAIKU: {"input": 0.0008, "output": 0.004}, MODEL_GPT4O: {"input": 0.0025, "output": 0.01}, MODEL_GEMINI_FLASH: {"input": 0.00015, "output": 0.0006}, MODEL_SONNET_OR: {"input": 0.003, "output": 0.015}, } # --- Concurrency --- MAX_EXTRACT_WORKERS = int(os.environ.get("MAX_EXTRACT_WORKERS", "5")) MAX_EVAL_WORKERS = int(os.environ.get("MAX_EVAL_WORKERS", "7")) MAX_MERGE_WORKERS = 1 # domain-serialized, but one merge at a time per 
domain # --- Timeouts (seconds) --- EXTRACT_TIMEOUT = 600 # 10 min EVAL_TIMEOUT = 120 # 2 min — routine Sonnet/Gemini Flash calls (was 600, caused 10-min stalls) EVAL_TIMEOUT_OPUS = 600 # 10 min — Opus DEEP eval needs more time for complex reasoning MERGE_TIMEOUT = 300 # 5 min — force-reset to conflict if exceeded (Rhea) CLAUDE_MAX_PROBE_TIMEOUT = 15 # --- Backpressure --- BACKPRESSURE_HIGH = 40 # pause extraction above this BACKPRESSURE_LOW = 20 # throttle extraction above this BACKPRESSURE_THROTTLE_WORKERS = 2 # workers when throttled # --- Retry budgets --- TRANSIENT_RETRY_MAX = 5 # API timeouts, rate limits SUBSTANTIVE_RETRY_STANDARD = 2 # reviewer request_changes SUBSTANTIVE_RETRY_DEEP = 3 MAX_EVAL_ATTEMPTS = 3 # Hard cap on eval cycles per PR before terminal MAX_FIX_ATTEMPTS = 2 # Hard cap on auto-fix cycles per PR before giving up MAX_FIX_PER_CYCLE = 15 # PRs to fix per cycle — bumped from 5 to clear backlog (Cory, Mar 14) # Issue tags that can be fixed mechanically (Python fixer or Haiku) # broken_wiki_links removed — downgraded to warning, not a gate. Links to claims # in other open PRs resolve naturally as the dependency chain merges. (Cory, Mar 14) MECHANICAL_ISSUE_TAGS = {"frontmatter_schema", "near_duplicate"} # Issue tags that require re-extraction (substantive quality problems) SUBSTANTIVE_ISSUE_TAGS = {"factual_discrepancy", "confidence_miscalibration", "scope_error", "title_overclaims"} # --- Content type schemas --- # Registry of content types. validate.py branches on type to apply the right # required fields, confidence rules, and title checks. Adding a new type is a # dict entry here — no code changes in validate.py needed. 
# Fields that every schema below lists first in its "required" tuple.
_CORE_FIELDS = ("type", "domain", "description")

# Per-type validation rules, keyed by the content "type" value:
#   required                — fields that must be present
#   valid_confidence        — allowed confidence values, or None when not constrained
#   needs_proposition_title — whether the proposition-title check applies
#   valid_status            — allowed status values (only "decision" declares it)
TYPE_SCHEMAS = {
    "claim": {
        "required": _CORE_FIELDS + ("confidence", "source", "created"),
        "valid_confidence": ("proven", "likely", "experimental", "speculative"),
        "needs_proposition_title": True,
    },
    "framework": {
        "required": _CORE_FIELDS + ("source", "created"),
        "valid_confidence": None,
        "needs_proposition_title": True,
    },
    "entity": {
        "required": _CORE_FIELDS,
        "valid_confidence": None,
        "needs_proposition_title": False,
    },
    "decision": {
        "required": _CORE_FIELDS + ("parent_entity", "status"),
        "valid_confidence": None,
        "needs_proposition_title": False,
        "valid_status": ("active", "passed", "failed", "expired", "cancelled"),
    },
}

# --- Content directories ---
# Path templates are centralized here (Rhea: don't hardcode across 5 files).
ENTITY_DIR_TEMPLATE = "entities/{domain}"
DECISION_DIR_TEMPLATE = "decisions/{domain}"

# --- Contributor tiers ---
# Auto-promotion rules. CI is computed, not stored.
CONTRIBUTOR_TIER_RULES = {
    "contributor": {"claims_merged": 1},
    "veteran": {
        "claims_merged": 10,
        "min_days_since_first": 30,
        "challenges_survived": 1,
    },
}

# Role weights for CI computation (must match schemas/contribution-weights.yaml)
CONTRIBUTION_ROLE_WEIGHTS = {
    "sourcer": 0.15,
    "extractor": 0.40,
    "challenger": 0.20,
    "synthesizer": 0.15,
    "reviewer": 0.10,
}

# --- Circuit breakers ---
BREAKER_THRESHOLD = 5
BREAKER_COOLDOWN = 15 * 60  # 15 min, in seconds

# --- Cost budgets ---
OPENROUTER_DAILY_BUDGET = 20.0  # USD
OPENROUTER_WARN_THRESHOLD = 0.8  # 80% of budget

# --- Quality ---
# 15% of LIGHT merges get pre-merge promotion to STANDARD (Rio)
SAMPLE_AUDIT_RATE = 0.15
# 10% disagreement → tighten LIGHT criteria
SAMPLE_AUDIT_DISAGREEMENT_THRESHOLD = 0.10
# Opus for audit — original rationale: "different family from Haiku triage" (Leo).
# NOTE(review): Opus and Haiku both look like Anthropic models — confirm what
# "family" is meant to distinguish here.
SAMPLE_AUDIT_MODEL = MODEL_OPUS

# --- Batch eval ---
# Batch domain review: group STANDARD PRs by domain, one LLM call per batch.
# Leo review stays individual (safety net for cross-contamination).
# Batch limits are env-overridable; defaults are 5 PRs and 100000 bytes.
BATCH_EVAL_MAX_PRS = int(os.getenv("BATCH_EVAL_MAX_PRS", "5"))
BATCH_EVAL_MAX_DIFF_BYTES = int(os.getenv("BATCH_EVAL_MAX_DIFF_BYTES", "100000"))  # 100KB

# --- Tier logic ---
# LIGHT_SKIP_LLM: when True, LIGHT PRs skip domain+Leo review entirely (auto-approve on Tier 0 pass).
# Set False for shadow mode (domain review runs but logs only). Flip True after 24h validation (Rhea).
# Only the string "true" (any letter case) enables it; every other value means False.
LIGHT_SKIP_LLM = os.getenv("LIGHT_SKIP_LLM", "false").lower() == "true"

# Random pre-merge promotion: fraction of LIGHT PRs upgraded to STANDARD before eval (Rio).
# Makes gaming unpredictable — extraction agents can't know which LIGHT PRs get full review.
LIGHT_PROMOTION_RATE = float(os.getenv("LIGHT_PROMOTION_RATE", "0.15"))

# --- Polling intervals (seconds) ---
INGEST_INTERVAL = 60
VALIDATE_INTERVAL = 30
EVAL_INTERVAL = 30
MERGE_INTERVAL = 30
FIX_INTERVAL = 60
HEALTH_CHECK_INTERVAL = 60

# --- Retrieval (Telegram bot) ---
RETRIEVAL_RRF_K = 20  # RRF smoothing constant — tuned for 5-10 results per source
RETRIEVAL_ENTITY_BOOST = 1.5  # RRF score multiplier for claims wiki-linked from matched entities
RETRIEVAL_MAX_RESULTS = 10  # Max claims shown to LLM after RRF merge
RETRIEVAL_MIN_CLAIM_SCORE = 3.0  # Floor for keyword claim scoring — filters single-stopword matches

# --- Health API ---
HEALTH_PORT = 8080

# --- Logging ---
LOG_FILE = LOG_DIR / "pipeline.jsonl"
LOG_ROTATION_MAX_BYTES = 50 * 1024**2  # 50MB per file
# Original note: "keep 7 days".
# NOTE(review): paired with a byte-size limit this looks like a count of rotated
# files rather than days — confirm against the log handler setup.
LOG_ROTATION_BACKUP_COUNT = 7

# --- Versioning (tracked in metrics_snapshots for chart annotations) ---
PROMPT_VERSION = "v2-lean-directed"  # bump on every prompt change
PIPELINE_VERSION = "2.2"  # bump on every significant pipeline change