Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source of truth. Previously only 8 of 67 files existed in repo — the rest were deployed directly to VPS via SCP, causing massive drift. Includes: - pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.) - pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh - diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics) - agent-state/: bootstrap, lib-state, cascade inbox processor, schema - systemd/: service unit files for reference - deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate - research-session.sh: updated with Step 8.5 digest + cascade inbox processing No new code written — all files are exact copies from VPS as of 2026-04-06. From this point forward: edit in repo, commit, then deploy.sh. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
219 lines · 9.5 KiB · Python
"""Pipeline v2 configuration — all constants and thresholds."""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# --- Paths ---
BASE_DIR = Path(os.getenv("PIPELINE_BASE", "/opt/teleo-eval"))

_WORKSPACES = BASE_DIR / "workspaces"
REPO_DIR = _WORKSPACES / "teleo-codex.git"
MAIN_WORKTREE = _WORKSPACES / "main"
SECRETS_DIR = BASE_DIR / "secrets"
LOG_DIR = BASE_DIR / "logs"
DB_PATH = BASE_DIR / "pipeline" / "pipeline.db"

# Single file-based lock shared by every process that writes to the main
# worktree (pipeline daemon stages and the telegram bot).
# Ganymede: one lock, one mechanism.
MAIN_WORKTREE_LOCKFILE = _WORKSPACES / ".main-worktree.lock"

# Inbox locations (relative paths, not anchored under BASE_DIR).
INBOX_QUEUE = "inbox/queue"
INBOX_ARCHIVE = "inbox/archive"
INBOX_NULL_RESULT = "inbox/null-result"

# --- Forgejo ---
FORGEJO_URL = os.getenv("FORGEJO_URL", "http://localhost:3000")
FORGEJO_OWNER = "teleo"
FORGEJO_REPO = "teleo-codex"
FORGEJO_TOKEN_FILE = SECRETS_DIR / "forgejo-admin-token"
FORGEJO_PIPELINE_USER = "teleo"  # git identity used for pipeline commits
# --- Models ---
CLAUDE_CLI = os.getenv("CLAUDE_CLI", "/home/teleo/.local/bin/claude")
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

# Model identifiers. Bare aliases ("opus", "sonnet") run on Claude Max via the
# CLI; slash-qualified IDs are OpenRouter models.
MODEL_OPUS = "opus"
MODEL_SONNET = "sonnet"
MODEL_HAIKU = "anthropic/claude-3.5-haiku"
MODEL_GPT4O = "openai/gpt-4o"  # legacy, kept for reference
MODEL_GEMINI_FLASH = "google/gemini-2.5-flash"  # was -preview, removed by OpenRouter
MODEL_SONNET_OR = "anthropic/claude-sonnet-4.5"  # OpenRouter Sonnet (paid, not Claude Max)

# --- Model assignment per stage ---
# Principle: Opus is scarce (Claude Max) — reserve it for DEEP eval and
# overnight research. Model diversity: domain review (Gemini Flash) and Leo
# STANDARD (Sonnet) are two model families, so no correlated blindspots; both
# run on OpenRouter, keeping the Claude Max rate limit untouched for Opus.
#
# Pipeline eval ordering (domain-first, Leo-last):
#   1. Domain review -> Gemini 2.5 Flash (OpenRouter) — different family from Leo
#   2. Leo STANDARD  -> Sonnet (OpenRouter)           — different family from domain
#   3. Leo DEEP      -> Opus (Claude Max)             — highest judgment, scarce
EXTRACT_MODEL = MODEL_SONNET               # extraction: structured output, volume work (Claude Max)
TRIAGE_MODEL = MODEL_HAIKU                 # triage: routing decision, cheapest (OpenRouter)
EVAL_DOMAIN_MODEL = MODEL_GEMINI_FLASH     # domain review (was GPT-4o — 16x cheaper, different family from Sonnet)
EVAL_LEO_MODEL = MODEL_OPUS                # Leo DEEP review: Claude Max Opus
EVAL_LEO_STANDARD_MODEL = MODEL_SONNET_OR  # Leo STANDARD review: OpenRouter Sonnet
EVAL_DEEP_MODEL = MODEL_GEMINI_FLASH       # DEEP cross-family: paid, adversarial
# --- Model backends ---
# Each model call can run on Claude Max (subscription, base load) or on the
# API (overflow / spikes). Claude Max is free but rate-limited; the API is
# paid but unlimited. When Claude Max is rate-limited, each stage follows one
# of three policies:
#   "queue"    — wait for capacity (preferred for non-urgent work)
#   "overflow" — fall back to the API (for time-sensitive work)
#   "skip"     — skip this cycle (for optional stages like sample audit)
OVERFLOW_POLICY = dict(
    extract="queue",         # extraction can wait
    triage="overflow",       # triage is cheap on API anyway
    eval_domain="overflow",  # domain review is the volume filter — don't let it bottleneck (Rhea)
    eval_leo="queue",        # Leo review is the bottleneck we protect
    eval_deep="overflow",    # DEEP is already on API
    sample_audit="skip",     # optional, skip if constrained
)
# OpenRouter cost rates in USD per 1K tokens (only applies when using the API,
# not Claude Max). Keys are model IDs.
# Fix: the "opus"/"sonnet" entries previously used string literals while the
# other four used MODEL_* constants — all keys now go through the constants so
# a model-ID change can never silently desynchronize cost lookup from model
# selection. (MODEL_OPUS == "opus" and MODEL_SONNET == "sonnet", so the dict
# contents are unchanged.)
MODEL_COSTS = {
    MODEL_OPUS: {"input": 0.015, "output": 0.075},
    MODEL_SONNET: {"input": 0.003, "output": 0.015},
    MODEL_HAIKU: {"input": 0.0008, "output": 0.004},
    MODEL_GPT4O: {"input": 0.0025, "output": 0.01},
    MODEL_GEMINI_FLASH: {"input": 0.00015, "output": 0.0006},
    MODEL_SONNET_OR: {"input": 0.003, "output": 0.015},
}
# --- Concurrency ---
MAX_EXTRACT_WORKERS = int(os.getenv("MAX_EXTRACT_WORKERS", "5"))
MAX_EVAL_WORKERS = int(os.getenv("MAX_EVAL_WORKERS", "7"))
# Merges are domain-serialized, but also only one merge at a time per domain.
MAX_MERGE_WORKERS = 1

# --- Timeouts (seconds) ---
EXTRACT_TIMEOUT = 10 * 60       # 10 min
EVAL_TIMEOUT = 2 * 60           # routine Sonnet/Gemini Flash calls (was 600 — caused 10-min stalls)
EVAL_TIMEOUT_OPUS = 10 * 60     # Opus DEEP eval needs more time for complex reasoning
MERGE_TIMEOUT = 5 * 60          # force-reset to conflict if exceeded (Rhea)
CLAUDE_MAX_PROBE_TIMEOUT = 15
# --- Backpressure ---
# Queue-depth thresholds that gate extraction throughput.
BACKPRESSURE_HIGH = 40             # pause extraction above this
BACKPRESSURE_LOW = 20              # throttle extraction above this
BACKPRESSURE_THROTTLE_WORKERS = 2  # worker count while throttled

# --- Retry budgets ---
TRANSIENT_RETRY_MAX = 5         # API timeouts, rate limits
SUBSTANTIVE_RETRY_STANDARD = 2  # reviewer request_changes
SUBSTANTIVE_RETRY_DEEP = 3
MAX_EVAL_ATTEMPTS = 3   # hard cap on eval cycles per PR before terminal
MAX_FIX_ATTEMPTS = 2    # hard cap on auto-fix cycles per PR before giving up
MAX_FIX_PER_CYCLE = 15  # PRs to fix per cycle — bumped from 5 to clear backlog (Cory, Mar 14)
# Issue tags a mechanical pass (Python fixer or Haiku) can resolve.
# broken_wiki_links was removed: downgraded to a warning, not a gate. Links to
# claims in other open PRs resolve naturally as the dependency chain merges.
# (Cory, Mar 14)
MECHANICAL_ISSUE_TAGS = {"frontmatter_schema", "near_duplicate"}

# Issue tags that force re-extraction (substantive quality problems).
SUBSTANTIVE_ISSUE_TAGS = {
    "factual_discrepancy",
    "confidence_miscalibration",
    "scope_error",
    "title_overclaims",
}
# --- Content type schemas ---
# Registry of content types. validate.py branches on type to pick the right
# required-field list, confidence rules, and title checks. Adding a new type
# is a dict entry here — no code change in validate.py needed.
#
# Fields common to every content type; per-type extras are appended below.
_BASE_REQUIRED = ("type", "domain", "description")

TYPE_SCHEMAS = {
    "claim": {
        "required": _BASE_REQUIRED + ("confidence", "source", "created"),
        "valid_confidence": ("proven", "likely", "experimental", "speculative"),
        "needs_proposition_title": True,
    },
    "framework": {
        "required": _BASE_REQUIRED + ("source", "created"),
        "valid_confidence": None,
        "needs_proposition_title": True,
    },
    "entity": {
        "required": _BASE_REQUIRED,
        "valid_confidence": None,
        "needs_proposition_title": False,
    },
    "decision": {
        "required": _BASE_REQUIRED + ("parent_entity", "status"),
        "valid_confidence": None,
        "needs_proposition_title": False,
        "valid_status": ("active", "passed", "failed", "expired", "cancelled"),
    },
}

# --- Content directories ---
# Centralized path templates (Rhea: don't hardcode these across 5 files).
ENTITY_DIR_TEMPLATE = "entities/{domain}"
DECISION_DIR_TEMPLATE = "decisions/{domain}"
# --- Contributor tiers ---
# Auto-promotion rules. CI is computed on demand, never stored.
CONTRIBUTOR_TIER_RULES = {
    "contributor": {"claims_merged": 1},
    "veteran": {
        "claims_merged": 10,
        "min_days_since_first": 30,
        "challenges_survived": 1,
    },
}

# Role weights for CI computation — must stay in sync with
# schemas/contribution-weights.yaml.
CONTRIBUTION_ROLE_WEIGHTS = {
    "sourcer": 0.15,
    "extractor": 0.40,
    "challenger": 0.20,
    "synthesizer": 0.15,
    "reviewer": 0.10,
}

# --- Circuit breakers ---
BREAKER_THRESHOLD = 5
BREAKER_COOLDOWN = 15 * 60  # 15 min
# --- Cost budgets ---
OPENROUTER_DAILY_BUDGET = 20.0   # USD
OPENROUTER_WARN_THRESHOLD = 0.8  # warn at 80% of the daily budget

# --- Quality ---
# Pre-merge sample audit (Rio): a fraction of LIGHT merges is promoted to
# STANDARD review before merging; disagreement above the threshold means the
# LIGHT criteria should be tightened.
SAMPLE_AUDIT_RATE = 0.15
SAMPLE_AUDIT_DISAGREEMENT_THRESHOLD = 0.10
SAMPLE_AUDIT_MODEL = MODEL_OPUS  # Opus for audit — different family from Haiku triage (Leo)
# --- Batch eval ---
# Batch domain review groups STANDARD PRs by domain — one LLM call per batch.
# Leo review stays individual (safety net for cross-contamination).
BATCH_EVAL_MAX_PRS = int(os.getenv("BATCH_EVAL_MAX_PRS", "5"))
BATCH_EVAL_MAX_DIFF_BYTES = int(os.getenv("BATCH_EVAL_MAX_DIFF_BYTES", "100000"))  # 100KB

# --- Tier logic ---
# When True, LIGHT PRs skip domain+Leo review entirely (auto-approve on a
# Tier 0 pass). Keep False for shadow mode (domain review runs but only logs);
# flip True after 24h validation (Rhea).
LIGHT_SKIP_LLM = os.getenv("LIGHT_SKIP_LLM", "false").lower() == "true"

# Random pre-merge promotion (Rio): this fraction of LIGHT PRs is upgraded to
# STANDARD before eval. Makes gaming unpredictable — extraction agents can't
# know which LIGHT PRs get full review.
LIGHT_PROMOTION_RATE = float(os.getenv("LIGHT_PROMOTION_RATE", "0.15"))
# --- Polling intervals (seconds) ---
# How often each daemon stage wakes to check for new work.
INGEST_INTERVAL = 60
VALIDATE_INTERVAL = 30
EVAL_INTERVAL = 30
MERGE_INTERVAL = 30
FIX_INTERVAL = 60
HEALTH_CHECK_INTERVAL = 60

# --- Retrieval (Telegram bot) ---
RETRIEVAL_RRF_K = 20             # RRF smoothing constant — tuned for 5-10 results per source
RETRIEVAL_ENTITY_BOOST = 1.5     # RRF score multiplier for claims wiki-linked from matched entities
RETRIEVAL_MAX_RESULTS = 10       # max claims shown to the LLM after RRF merge
RETRIEVAL_MIN_CLAIM_SCORE = 3.0  # keyword-score floor — filters single-stopword matches
# --- Health API ---
HEALTH_PORT = 8080

# --- Logging ---
LOG_FILE = LOG_DIR / "pipeline.jsonl"
LOG_ROTATION_MAX_BYTES = 50 * 1024 * 1024  # 50MB per file
LOG_ROTATION_BACKUP_COUNT = 7              # keep 7 days

# --- Versioning ---
# Tracked in metrics_snapshots so charts can be annotated with the version
# that produced each data point.
PROMPT_VERSION = "v2-lean-directed"  # bump on every prompt change
PIPELINE_VERSION = "2.2"             # bump on every significant pipeline change