Three targeted fixes from Ganymede's review of commit 469cb7f:
BUG #1 — Success path now updates sources.status='extracting' before PR
creation, so queue scan's DB-authoritative filter catches sources between
PR creation and merge. Previously the cooldown gate was load-bearing for
this window, not belt-and-suspenders as claimed.
BUG #2 — Second null-result path (line 573, triggered when enrichments
existed but all targets were missing in worktree) now updates DB. Without
this, that path created no PR, no DB mark, and would have re-entered the
runaway loop 4h later when the cooldown window expired.
NIT #6 — 4h cooldown moved to config.EXTRACTION_COOLDOWN_HOURS. Tunable
without code change. Log format now shows the configured hours.
Also backfilled 59 pre-existing zombie queue-path rows where the file
was already archived but DB status said 'unprocessed' — these would have
leaked past the DB filter once the 4h cooldown expired.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
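A minimal sketch of the queue-scan gate these fixes tighten (illustrative only: the sources table and its status column come from the description above, while the last_pr_at column, the helper name, and the import path are assumptions, not the pipeline's actual code):

import sqlite3
from datetime import datetime, timedelta, timezone

from config import DB_PATH, EXTRACTION_COOLDOWN_HOURS  # config module shown below

def eligible_sources():
    """DB-authoritative status filter first; cooldown window only as defense-in-depth."""
    cutoff = datetime.now(timezone.utc) - timedelta(hours=EXTRACTION_COOLDOWN_HOURS)
    with sqlite3.connect(DB_PATH) as db:
        rows = db.execute(
            "SELECT id FROM sources"
            " WHERE status = 'unprocessed'"                    # primary gate (BUG #1/#2 keep this accurate)
            " AND (last_pr_at IS NULL OR last_pr_at < ?)",     # secondary gate: cooldown window
            (cutoff.isoformat(),),
        ).fetchall()
    return [row[0] for row in rows]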
"""Pipeline v2 configuration — all constants and thresholds."""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# --- Paths ---
|
|
BASE_DIR = Path(os.environ.get("PIPELINE_BASE", "/opt/teleo-eval"))
|
|
REPO_DIR = BASE_DIR / "workspaces" / "teleo-codex.git"
|
|
MAIN_WORKTREE = BASE_DIR / "workspaces" / "main"
|
|
SECRETS_DIR = BASE_DIR / "secrets"
|
|
LOG_DIR = BASE_DIR / "logs"
|
|
DB_PATH = BASE_DIR / "pipeline" / "pipeline.db"
|
|
# File-based worktree lock path — used by all processes that write to main worktree
|
|
# (pipeline daemon stages + telegram bot). Ganymede: one lock, one mechanism.
|
|
MAIN_WORKTREE_LOCKFILE = BASE_DIR / "workspaces" / ".main-worktree.lock"
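
# Illustrative sketch only (hypothetical helper; the real locking code lives in the
# daemon/bot, not in config): one way to honor the "one lock, one mechanism" rule
# above with POSIX advisory locking on MAIN_WORKTREE_LOCKFILE.
def _sketch_with_main_worktree_lock(run):
    import fcntl  # local import keeps config importable without side effects
    MAIN_WORKTREE_LOCKFILE.parent.mkdir(parents=True, exist_ok=True)
    with open(MAIN_WORKTREE_LOCKFILE, "w") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)  # blocks until no other writer holds the lock
        try:
            return run()
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)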

INBOX_QUEUE = "inbox/queue"
INBOX_ARCHIVE = "inbox/archive"
INBOX_NULL_RESULT = "inbox/null-result"

# --- Forgejo ---
FORGEJO_URL = os.environ.get("FORGEJO_URL", "http://localhost:3000")
FORGEJO_OWNER = "teleo"
FORGEJO_REPO = "teleo-codex"
FORGEJO_TOKEN_FILE = SECRETS_DIR / "forgejo-admin-token"
FORGEJO_PIPELINE_USER = "teleo" # git user for pipeline commits

# --- Models ---
CLAUDE_CLI = os.environ.get("CLAUDE_CLI", "/home/teleo/.local/bin/claude")
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

# Model IDs
MODEL_OPUS = "opus"
MODEL_SONNET = "sonnet"
MODEL_HAIKU = "anthropic/claude-3.5-haiku"
MODEL_GPT4O = "openai/gpt-4o" # legacy, kept for reference
MODEL_GEMINI_FLASH = "google/gemini-2.5-flash" # was -preview, removed by OpenRouter
MODEL_SONNET_OR = "anthropic/claude-sonnet-4.5" # OpenRouter Sonnet (paid, not Claude Max)

# --- Model assignment per stage ---
# Principle: Opus is scarce (Claude Max). Reserve for DEEP eval + overnight research.
# Model diversity: domain (Gemini 2.5 Flash, was GPT-4o) + Leo (Sonnet) = two model families, no correlated blindspots.
# Both on OpenRouter = Claude Max rate limit untouched for Opus.
#
# Pipeline eval ordering (domain-first, Leo-last):
# 1. Domain review → Gemini 2.5 Flash (OpenRouter) — different family from Leo
# 2. Leo STANDARD → Sonnet (OpenRouter) — different family from domain
# 3. Leo DEEP → Opus (Claude Max) — highest judgment, scarce
EXTRACT_MODEL = MODEL_SONNET # extraction: structured output, volume work (Claude Max)
TRIAGE_MODEL = MODEL_HAIKU # triage: routing decision, cheapest (OpenRouter)
EVAL_DOMAIN_MODEL = MODEL_GEMINI_FLASH # domain review: Gemini 2.5 Flash (was GPT-4o — 16x cheaper, different family from Sonnet)
EVAL_LEO_MODEL = MODEL_OPUS # Leo DEEP review: Claude Max Opus
EVAL_LEO_STANDARD_MODEL = MODEL_SONNET_OR # Leo STANDARD review: OpenRouter Sonnet
EVAL_DEEP_MODEL = MODEL_GEMINI_FLASH # DEEP cross-family: paid, adversarial

# --- Model backends ---
# Each model can run on Claude Max (subscription, base load) or API (overflow/spikes).
# Claude Max: free but rate-limited. API: paid but unlimited.
# When Claude Max is rate-limited, behavior per stage:
# "queue" — wait for capacity (preferred for non-urgent work)
# "overflow" — fall back to API (for time-sensitive work)
# "skip" — skip this cycle (for optional stages like sample audit)
OVERFLOW_POLICY = {
    "extract": "queue", # extraction can wait
    "triage": "overflow", # triage is cheap on API anyway
    "eval_domain": "overflow", # domain review is the volume filter — don't let it bottleneck (Rhea)
    "eval_leo": "queue", # Leo review is the bottleneck we protect
    "eval_deep": "overflow", # DEEP is already on API
    "sample_audit": "skip", # optional, skip if constrained
}
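
# Illustrative sketch (hypothetical helper and backend names; the real dispatch lives
# in the stage runners): how a stage might consult OVERFLOW_POLICY once the Claude Max
# probe reports rate limiting.
def _sketch_pick_backend(stage, claude_max_available):
    if claude_max_available:
        return "claude_max"
    policy = OVERFLOW_POLICY.get(stage, "queue")
    if policy == "overflow":
        return "openrouter"  # fall back to paid API for time-sensitive work
    if policy == "skip":
        return None          # caller drops this cycle (optional stages)
    return "wait"            # "queue": caller retries when capacity returns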

# OpenRouter cost rates per 1K tokens (only applies when using API, not Claude Max)
MODEL_COSTS = {
    MODEL_OPUS: {"input": 0.015, "output": 0.075},
    MODEL_SONNET: {"input": 0.003, "output": 0.015},
    MODEL_HAIKU: {"input": 0.0008, "output": 0.004},
    MODEL_GPT4O: {"input": 0.0025, "output": 0.01},
    MODEL_GEMINI_FLASH: {"input": 0.00015, "output": 0.0006},
    MODEL_SONNET_OR: {"input": 0.003, "output": 0.015},
}
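
# Illustrative sketch (hypothetical helper): estimating the USD cost of one API call
# from MODEL_COSTS, which is priced per 1K tokens. Claude Max calls are not metered;
# only OpenRouter/API usage counts against the daily budget.
# Example: a Gemini Flash call with 2,000 input and 500 output tokens costs
# 2 * 0.00015 + 0.5 * 0.0006 = $0.0006.
def _sketch_call_cost(model, input_tokens, output_tokens):
    rates = MODEL_COSTS[model]
    return (input_tokens / 1000) * rates["input"] + (output_tokens / 1000) * rates["output"]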

# --- Concurrency ---
MAX_EXTRACT_WORKERS = int(os.environ.get("MAX_EXTRACT_WORKERS", "5"))
MAX_EVAL_WORKERS = int(os.environ.get("MAX_EVAL_WORKERS", "7"))
MAX_MERGE_WORKERS = 1 # merges are domain-serialized — one merge at a time per domain

# --- Timeouts (seconds) ---
EXTRACT_TIMEOUT = 600 # 10 min
EVAL_TIMEOUT = 120 # 2 min — routine Sonnet/Gemini Flash calls (was 600, caused 10-min stalls)
EVAL_TIMEOUT_OPUS = 600 # 10 min — Opus DEEP eval needs more time for complex reasoning
MERGE_TIMEOUT = 300 # 5 min — force-reset to conflict if exceeded (Rhea)
CLAUDE_MAX_PROBE_TIMEOUT = 15

# --- Backpressure ---
BACKPRESSURE_HIGH = 40 # pause extraction above this
BACKPRESSURE_LOW = 20 # throttle extraction above this
BACKPRESSURE_THROTTLE_WORKERS = 2 # workers when throttled
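
# Illustrative sketch (hypothetical helper; the exact backlog metric is an assumption,
# e.g. open PRs awaiting eval): mapping backlog depth onto extraction workers using the
# thresholds above.
def _sketch_extract_worker_budget(backlog):
    if backlog > BACKPRESSURE_HIGH:
        return 0                              # pause extraction entirely
    if backlog > BACKPRESSURE_LOW:
        return BACKPRESSURE_THROTTLE_WORKERS  # throttled
    return MAX_EXTRACT_WORKERS                # normal operation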

# --- Retry budgets ---
TRANSIENT_RETRY_MAX = 5 # API timeouts, rate limits
SUBSTANTIVE_RETRY_STANDARD = 2 # reviewer request_changes
SUBSTANTIVE_RETRY_DEEP = 3
MAX_EVAL_ATTEMPTS = 3 # Hard cap on eval cycles per PR before terminal
MAX_FIX_ATTEMPTS = 2 # Hard cap on auto-fix cycles per PR before giving up
MAX_FIX_PER_CYCLE = 15 # PRs to fix per cycle — bumped from 5 to clear backlog (Cory, Mar 14)

# Issue tags that can be fixed mechanically (Python fixer or Haiku)
# broken_wiki_links removed — downgraded to warning, not a gate. Links to claims
# in other open PRs resolve naturally as the dependency chain merges. (Cory, Mar 14)
MECHANICAL_ISSUE_TAGS = {"frontmatter_schema", "near_duplicate"}
# Issue tags that require re-extraction (substantive quality problems)
SUBSTANTIVE_ISSUE_TAGS = {"factual_discrepancy", "confidence_miscalibration", "scope_error", "title_overclaims"}
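
# Illustrative sketch (hypothetical helper and outcome names): routing reviewer issues
# by tag. Any substantive tag forces re-extraction; a set of purely mechanical tags goes
# to the cheap fixer; anything else is left for a human.
def _sketch_route_issues(issue_tags):
    tags = set(issue_tags)
    if tags & SUBSTANTIVE_ISSUE_TAGS:
        return "re_extract"
    if tags and tags <= MECHANICAL_ISSUE_TAGS:
        return "auto_fix"
    return "manual"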

# --- Content type schemas ---
# Registry of content types. validate.py branches on type to apply the right
# required fields, confidence rules, and title checks. Adding a new type is a
# dict entry here — no code changes in validate.py needed.
TYPE_SCHEMAS = {
    "claim": {
        "required": ("type", "domain", "description", "confidence", "source", "created"),
        "valid_confidence": ("proven", "likely", "experimental", "speculative"),
        "needs_proposition_title": True,
    },
    "framework": {
        "required": ("type", "domain", "description", "source", "created"),
        "valid_confidence": None,
        "needs_proposition_title": True,
    },
    "entity": {
        "required": ("type", "domain", "description"),
        "valid_confidence": None,
        "needs_proposition_title": False,
    },
    "decision": {
        "required": ("type", "domain", "description", "parent_entity", "status"),
        "valid_confidence": None,
        "needs_proposition_title": False,
        "valid_status": ("active", "passed", "failed", "expired", "cancelled"),
    },
}
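
# Illustrative sketch (hypothetical helper; the real checks live in validate.py):
# validating frontmatter against TYPE_SCHEMAS. Returns a list of issue strings,
# empty when the entry passes. The proposition-title check is omitted here.
def _sketch_validate_frontmatter(frontmatter):
    schema = TYPE_SCHEMAS.get(frontmatter.get("type"))
    if schema is None:
        return [f"unknown type: {frontmatter.get('type')!r}"]
    issues = [f"missing field: {field}" for field in schema["required"] if field not in frontmatter]
    valid_confidence = schema["valid_confidence"]
    if valid_confidence and frontmatter.get("confidence") not in valid_confidence:
        issues.append(f"invalid confidence: {frontmatter.get('confidence')!r}")
    valid_status = schema.get("valid_status")
    if valid_status and frontmatter.get("status") not in valid_status:
        issues.append(f"invalid status: {frontmatter.get('status')!r}")
    return issues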

# --- Content directories ---
ENTITY_DIR_TEMPLATE = "entities/{domain}" # centralized path (Rhea: don't hardcode across 5 files)
DECISION_DIR_TEMPLATE = "decisions/{domain}"

# --- Contributor tiers ---
# Auto-promotion rules. CI is computed, not stored.
CONTRIBUTOR_TIER_RULES = {
    "contributor": {
        "claims_merged": 1,
    },
    "veteran": {
        "claims_merged": 10,
        "min_days_since_first": 30,
        "challenges_survived": 1,
    },
}
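
# Illustrative sketch (hypothetical helper; the stats field names and the "newcomer"
# base-tier label are assumptions): checking one contributor against the rule table.
def _sketch_contributor_tier(stats):
    veteran = CONTRIBUTOR_TIER_RULES["veteran"]
    if (stats.get("claims_merged", 0) >= veteran["claims_merged"]
            and stats.get("days_since_first", 0) >= veteran["min_days_since_first"]
            and stats.get("challenges_survived", 0) >= veteran["challenges_survived"]):
        return "veteran"
    if stats.get("claims_merged", 0) >= CONTRIBUTOR_TIER_RULES["contributor"]["claims_merged"]:
        return "contributor"
    return "newcomer"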

# Role weights for CI computation (must match core/contribution-architecture.md)
CONTRIBUTION_ROLE_WEIGHTS = {
    "challenger": 0.35,
    "synthesizer": 0.25,
    "reviewer": 0.20,
    "sourcer": 0.15,
    "extractor": 0.05,
}
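
# Illustrative sketch (one plausible reading; the authoritative formula is in
# core/contribution-architecture.md): CI as a weighted sum of per-role counts.
# Example: 2 challenger + 1 reviewer contributions score 2 * 0.35 + 1 * 0.20 = 0.90.
def _sketch_contribution_index(role_counts):
    return sum(CONTRIBUTION_ROLE_WEIGHTS.get(role, 0.0) * count
               for role, count in role_counts.items())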

# --- Circuit breakers ---
BREAKER_THRESHOLD = 5
BREAKER_COOLDOWN = 900 # 15 min
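
# Illustrative sketch (hypothetical class; the real breaker lives with the API clients):
# refuse calls after BREAKER_THRESHOLD consecutive failures, allow again once
# BREAKER_COOLDOWN seconds have passed, and re-open on the next failure.
class _SketchBreaker:
    def __init__(self):
        self.consecutive_failures = 0
        self.opened_at = 0.0

    def allow(self, now):
        if self.consecutive_failures < BREAKER_THRESHOLD:
            return True
        return (now - self.opened_at) >= BREAKER_COOLDOWN

    def record(self, success, now):
        if success:
            self.consecutive_failures = 0
        else:
            self.consecutive_failures += 1
            if self.consecutive_failures >= BREAKER_THRESHOLD:
                self.opened_at = now  # (re)open: failures during half-open push the cooldown forward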

# --- Cost budgets ---
OPENROUTER_DAILY_BUDGET = 20.0 # USD
OPENROUTER_WARN_THRESHOLD = 0.8 # 80% of budget

# --- Quality ---
SAMPLE_AUDIT_RATE = 0.15 # 15% of LIGHT merges get pre-merge promotion to STANDARD (Rio)
SAMPLE_AUDIT_DISAGREEMENT_THRESHOLD = 0.10 # 10% disagreement → tighten LIGHT criteria
SAMPLE_AUDIT_MODEL = MODEL_OPUS # Opus for audit — different family from Haiku triage (Leo)

# --- Batch eval ---
# Batch domain review: group STANDARD PRs by domain, one LLM call per batch.
# Leo review stays individual (safety net for cross-contamination).
BATCH_EVAL_MAX_PRS = int(os.environ.get("BATCH_EVAL_MAX_PRS", "5"))
BATCH_EVAL_MAX_DIFF_BYTES = int(os.environ.get("BATCH_EVAL_MAX_DIFF_BYTES", "100000")) # 100KB
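
# Illustrative sketch (hypothetical helper; the input shape is an assumption):
# grouping STANDARD PRs by domain, closing a batch when it hits BATCH_EVAL_MAX_PRS
# or BATCH_EVAL_MAX_DIFF_BYTES of combined diff. `prs` is an iterable of
# (domain, pr_number, diff_bytes) tuples.
def _sketch_batch_for_domain_review(prs):
    batches = {}
    for domain, pr_number, diff_bytes in prs:
        domain_batches = batches.setdefault(domain, [[]])
        current = domain_batches[-1]
        current_bytes = sum(size for _, size in current)
        if current and (len(current) >= BATCH_EVAL_MAX_PRS
                        or current_bytes + diff_bytes > BATCH_EVAL_MAX_DIFF_BYTES):
            current = []
            domain_batches.append(current)
        current.append((pr_number, diff_bytes))
    return batches  # domain -> list of batches of (pr_number, diff_bytes)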

# --- Tier logic ---
# LIGHT_SKIP_LLM: when True, LIGHT PRs skip domain+Leo review entirely (auto-approve on Tier 0 pass).
# Set False for shadow mode (domain review runs but logs only). Flip to True after 24h validation (Rhea).
LIGHT_SKIP_LLM = os.environ.get("LIGHT_SKIP_LLM", "false").lower() == "true"
# Random pre-merge promotion: fraction of LIGHT PRs upgraded to STANDARD before eval (Rio).
# Makes gaming unpredictable — extraction agents can't know which LIGHT PRs get full review.
LIGHT_PROMOTION_RATE = float(os.environ.get("LIGHT_PROMOTION_RATE", "0.15"))
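
# Illustrative sketch (hypothetical helper and outcome names): the LIGHT-tier decision
# described above. A LIGHT PR is randomly promoted to STANDARD at LIGHT_PROMOTION_RATE;
# otherwise it is auto-approved only when LIGHT_SKIP_LLM is on, else it still gets the
# shadow-mode domain review. `rng` is a random.Random instance, injected so tests can seed it.
def _sketch_light_tier_action(rng):
    if rng.random() < LIGHT_PROMOTION_RATE:
        return "promote_to_standard"  # full eval path; keeps gaming unpredictable
    return "auto_approve" if LIGHT_SKIP_LLM else "shadow_review"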

# --- Polling intervals (seconds) ---
INGEST_INTERVAL = 60
VALIDATE_INTERVAL = 30
EVAL_INTERVAL = 30
MERGE_INTERVAL = 30
FIX_INTERVAL = 60
HEALTH_CHECK_INTERVAL = 60

# --- Extraction gates ---
EXTRACTION_COOLDOWN_HOURS = 4 # Skip sources with any PR activity in this window. Defense-in-depth for DB-status filter.

# --- Retrieval (Telegram bot) ---
RETRIEVAL_RRF_K = 20 # RRF smoothing constant — tuned for 5-10 results per source
RETRIEVAL_ENTITY_BOOST = 1.5 # RRF score multiplier for claims wiki-linked from matched entities
RETRIEVAL_MAX_RESULTS = 10 # Max claims shown to LLM after RRF merge
RETRIEVAL_MIN_CLAIM_SCORE = 3.0 # Floor for keyword claim scoring — filters single-stopword matches
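
# Illustrative sketch (hypothetical helper; the bot's actual retrieval code lives
# elsewhere): Reciprocal Rank Fusion over several ranked claim-ID lists (e.g. keyword
# and embedding retrieval). Each list contributes 1 / (RETRIEVAL_RRF_K + rank), and
# claims wiki-linked from matched entities get the RETRIEVAL_ENTITY_BOOST multiplier.
def _sketch_rrf_merge(ranked_lists, entity_linked_ids=frozenset()):
    scores = {}
    for results in ranked_lists:
        for rank, claim_id in enumerate(results, start=1):
            scores[claim_id] = scores.get(claim_id, 0.0) + 1.0 / (RETRIEVAL_RRF_K + rank)
    for claim_id in scores:
        if claim_id in entity_linked_ids:
            scores[claim_id] *= RETRIEVAL_ENTITY_BOOST
    return sorted(scores, key=scores.get, reverse=True)[:RETRIEVAL_MAX_RESULTS]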

# --- Health API ---
HEALTH_PORT = 8080

# --- Logging ---
LOG_FILE = LOG_DIR / "pipeline.jsonl"
LOG_ROTATION_MAX_BYTES = 50 * 1024 * 1024 # 50MB per file
LOG_ROTATION_BACKUP_COUNT = 7 # keep 7 days
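
# Illustrative sketch (not the daemon's actual logging setup): wiring the rotation
# constants above into a standard size-based RotatingFileHandler for the JSONL log.
def _sketch_log_handler():
    import logging.handlers  # local import keeps config importable without side effects
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    return logging.handlers.RotatingFileHandler(
        LOG_FILE,
        maxBytes=LOG_ROTATION_MAX_BYTES,
        backupCount=LOG_ROTATION_BACKUP_COUNT,
    )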

# --- Versioning (tracked in metrics_snapshots for chart annotations) ---
PROMPT_VERSION = "v2-lean-directed" # bump on every prompt change
PIPELINE_VERSION = "2.2" # bump on every significant pipeline change