teleo-codex/ops/pipeline-v2/lib/post_extract.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00

551 lines
20 KiB
Python

"""Post-extraction validator — deterministic fixes and quality gate.
Runs AFTER LLM extraction, BEFORE git commit. Pure Python, $0 cost.
Catches the mechanical issues that account for 73% of eval rejections:
- Frontmatter schema violations (missing/invalid fields)
- Broken wiki links (strips brackets, keeps text)
- Date errors (wrong format, source date instead of today)
- Filename convention violations
- Title precision (too short, not a proposition)
- Duplicate detection against existing KB
Design principles (Leo):
- Mechanical rules belong in code, not prompts
- Fix what's fixable, reject what's not
- Never silently drop content — log everything
Epimetheus owns this module. Leo reviews changes.
"""
import json
import logging
import os
import re
from datetime import date, datetime
from difflib import SequenceMatcher
from pathlib import Path
logger = logging.getLogger("pipeline.post_extract")
# ─── Constants ──────────────────────────────────────────────────────────────
# Canonical set of KB domains; any other value is rewritten by fix_frontmatter
# to the batch domain and flagged by validate_claim.
VALID_DOMAINS = frozenset({
    "internet-finance", "entertainment", "health", "ai-alignment",
    "space-development", "grand-strategy", "mechanisms", "living-capital",
    "living-agents", "teleohumanity", "critical-systems",
    "collective-intelligence", "teleological-economics", "cultural-dynamics",
})
# Allowed values of the frontmatter "confidence" field (claims only).
VALID_CONFIDENCE = frozenset({"proven", "likely", "experimental", "speculative"})
# Frontmatter fields that must be present and non-null per type.
REQUIRED_CLAIM_FIELDS = ("type", "domain", "description", "confidence", "source", "created")
REQUIRED_ENTITY_FIELDS = ("type", "domain", "description")
# Matches [[wiki link]] spans; group 1 is the link text without brackets.
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
# Minimum title word count for claims (Leo: titles must name specific mechanism)
MIN_TITLE_WORDS = 8
# SequenceMatcher ratio at/above which two titles count as near-duplicates.
DEDUP_THRESHOLD = 0.85
# ─── YAML parsing ──────────────────────────────────────────────────────────
def parse_frontmatter(text: str) -> tuple[dict | None, str]:
"""Extract YAML frontmatter from markdown. Returns (frontmatter_dict, body)."""
if not text.startswith("---"):
return None, text
end = text.find("---", 3)
if end == -1:
return None, text
raw = text[3:end]
body = text[end + 3:].strip()
try:
import yaml
fm = yaml.safe_load(raw)
if not isinstance(fm, dict):
return None, body
return fm, body
except ImportError:
pass
except Exception:
return None, body
# Fallback: simple key-value parser
fm = {}
for line in raw.strip().split("\n"):
line = line.strip()
if not line or line.startswith("#"):
continue
if ":" not in line:
continue
key, _, val = line.partition(":")
key = key.strip()
val = val.strip().strip('"').strip("'")
if val.lower() == "null" or val == "":
val = None
elif val.startswith("["):
val = [v.strip().strip('"').strip("'") for v in val.strip("[]").split(",") if v.strip()]
fm[key] = val
return fm if fm else None, body
# ─── Fixers (modify content, return fixed version) ─────────────────────────
def fix_frontmatter(content: str, domain: str, agent: str) -> tuple[str, list[str]]:
    """Fix common frontmatter issues deterministically.

    Args:
        content: Full markdown content (frontmatter + body).
        domain: Batch domain; used to repair a missing or invalid domain field.
        agent: Extracting agent name; used for the default source attribution.

    Returns:
        (fixed_content, fixes_applied). fixes_applied is empty when nothing
        changed, and ["unfixable:no_frontmatter"] when there is no
        frontmatter to repair at all.
    """
    fixes: list[str] = []
    fm, body = parse_frontmatter(content)
    if fm is None:
        # Nothing to repair without frontmatter — caller decides what to do.
        return content, ["unfixable:no_frontmatter"]
    changed = False
    ftype = fm.get("type", "claim")
    # Fix 1: created = extraction date, always today. No parsing, no comparison.
    # "created" means "when this was extracted," period. Source publication date
    # belongs in a separate field if needed. (Ganymede review)
    today_str = date.today().isoformat()
    if ftype == "claim":
        old_created = fm.get("created")
        fm["created"] = today_str
        if old_created != today_str:
            fixes.append(f"set_created:{today_str}")
            changed = True
    # Fix 2: type field
    if "type" not in fm:
        fm["type"] = "claim"
        fixes.append("added_type:claim")
        changed = True
    # Fix 3: domain field. Capture the old value BEFORE overwriting it so the
    # log records the real old->new transition. (Previously fm was read after
    # the assignment, so both sides of the arrow showed the new domain.)
    old_domain = fm.get("domain", "missing")
    if old_domain not in VALID_DOMAINS:
        fm["domain"] = domain
        fixes.append(f"fixed_domain:{old_domain}->{domain}")
        changed = True
    # Fix 4: confidence field (claims only)
    if ftype == "claim":
        conf = fm.get("confidence")
        if conf is None:
            fm["confidence"] = "experimental"
            fixes.append("added_confidence:experimental")
            changed = True
        elif conf not in VALID_CONFIDENCE:
            fm["confidence"] = "experimental"
            fixes.append(f"fixed_confidence:{conf}->experimental")
            changed = True
    # Fix 5: description field — derive from the body's first sentence when
    # missing/empty; only accept it if it is long enough to be meaningful.
    if "description" not in fm or not fm["description"]:
        first_sentence = body.split(".")[0].strip().lstrip("# ") if body else ""
        if first_sentence and len(first_sentence) > 10:
            fm["description"] = first_sentence[:200]
            fixes.append("derived_description_from_body")
            changed = True
    # Fix 6: source field (claims only)
    if ftype == "claim" and ("source" not in fm or not fm["source"]):
        fm["source"] = f"extraction by {agent}"
        fixes.append("added_default_source")
        changed = True
    if not changed:
        return content, []
    # Reconstruct frontmatter with canonical field ordering.
    return _rebuild_content(fm, body), fixes
def fix_wiki_links(content: str, existing_claims: set[str]) -> tuple[str, list[str]]:
    """Fix or strip broken wiki links. Resolves slug→space mismatches before stripping.

    The LLM often generates wiki links as slugs (hyphens) but KB filenames use
    spaces. Try normalizing hyphens→spaces before giving up and stripping
    brackets.
    """
    applied: list[str] = []
    # Index from normalized form (lowercase, hyphens as spaces) back to the
    # real KB stem, so slugged links can be repaired instead of stripped.
    canon = {stem.lower().replace("-", " "): stem for stem in existing_claims}

    def _resolve(match: "re.Match") -> str:
        target = match.group(1).strip()
        if target in existing_claims:
            # Exact match — keep the link untouched.
            return match.group(0)
        hit = canon.get(target.lower().replace("-", " "))
        if hit is not None:
            applied.append(f"resolved_wiki_link:{target[:40]}->{hit[:40]}")
            return f"[[{hit}]]"
        # Unresolvable — keep the link text, drop the brackets.
        applied.append(f"stripped_wiki_link:{target[:60]}")
        return target

    return re.sub(r"\[\[([^\]]+)\]\]", _resolve, content), applied
def fix_trailing_newline(content: str) -> tuple[str, list[str]]:
    """Ensure content ends with a trailing newline.

    Only appends a newline when one is missing; existing extra trailing
    newlines are left alone. (The previous docstring's "exactly one"
    overstated what this does.)
    """
    if content.endswith("\n"):
        return content, []
    return content + "\n", ["added_trailing_newline"]
def fix_h1_title_match(content: str, filename: str) -> tuple[str, list[str]]:
    """Ensure the content has an H1 title. Does NOT replace existing H1s.

    The H1 title in the content is authoritative — the filename is derived
    from it and may be truncated or slightly different. We only add a missing
    H1, never overwrite an existing one.
    """
    fm, body = parse_frontmatter(content)
    if fm is None:
        return content, []
    if re.search(r"^# (.+)$", body, re.MULTILINE):
        # An H1 already exists — it is authoritative; leave it untouched.
        return content, []
    if body and not body.startswith("#"):
        # No heading at all — synthesize an H1 from the filename stem.
        derived = Path(filename).stem.replace("-", " ")
        return _rebuild_content(fm, f"# {derived}\n\n{body}"), ["added_h1_title"]
    # Empty body, or body opening with a lower-level heading: do nothing.
    return content, []
# ─── Validators (check without modifying, return issues) ──────────────────
def validate_claim(filename: str, content: str, existing_claims: set[str], agent: str | None = None) -> list[str]:
    """Validate a claim file. Returns list of issues (empty = pass).

    Args:
        filename: Target filename; its stem is the fallback title and the
            near-duplicate comparison key.
        content: Full markdown content (frontmatter + body).
        existing_claims: Stems of existing KB files for duplicate detection.
        agent: Extracting agent, forwarded to attribution validation.
    """
    issues = []
    fm, body = parse_frontmatter(content)
    if fm is None:
        return ["no_frontmatter"]
    ftype = fm.get("type", "claim")
    # Schema check: required fields must be present and non-null.
    required = REQUIRED_CLAIM_FIELDS if ftype == "claim" else REQUIRED_ENTITY_FIELDS
    for field in required:
        if field not in fm or fm[field] is None:
            issues.append(f"missing_field:{field}")
    # Domain check
    domain = fm.get("domain")
    if domain and domain not in VALID_DOMAINS:
        issues.append(f"invalid_domain:{domain}")
    # Confidence check (claims only)
    if ftype == "claim":
        conf = fm.get("confidence")
        if conf and conf not in VALID_CONFIDENCE:
            issues.append(f"invalid_confidence:{conf}")
    # Title checks (claims only, not entities).
    # Use H1 from body if available (authoritative), fall back to filename.
    # NOTE: `title` is only bound in this branch; the OPSEC check below that
    # reads it is gated on ftype == "claim", a subset of this branch.
    if ftype in ("claim", "framework"):
        h1_match = re.search(r"^# (.+)$", body, re.MULTILINE)
        title = h1_match.group(1).strip() if h1_match else Path(filename).stem.replace("-", " ")
        words = title.split()
        # Always enforce minimum 4 words — a 2-3 word title is never specific
        # enough to disagree with. (Ganymede review)
        if len(words) < 4:
            issues.append("title_too_few_words")
        elif len(words) < MIN_TITLE_WORDS:
            # 4-7 word titles must also contain a verb/connective so the title
            # reads as a proposition, not a topic label. (Heuristic regex: the
            # final alternative matches common verb suffixes.)
            has_verb = bool(re.search(
                r"\b(is|are|was|were|will|would|can|could|should|must|has|have|had|"
                r"does|did|do|may|might|shall|"
                r"because|therefore|however|although|despite|since|through|by|"
                r"when|where|while|if|unless|"
                r"rather than|instead of|not just|more than|"
                r"\w+(?:s|ed|ing|es|tes|ses|zes|ves|cts|pts|nts|rns))\b",
                title, re.IGNORECASE,
            ))
            if not has_verb:
                issues.append("title_not_proposition")
    # Description quality
    desc = fm.get("description", "")
    if isinstance(desc, str) and len(desc.strip()) < 10:
        issues.append("description_too_short")
    # Attribution check: extractor must be identified. (Leo: block extractor, warn sourcer)
    if ftype == "claim":
        from .attribution import validate_attribution
        issues.extend(validate_attribution(fm, agent=agent))
    # OPSEC check: flag claims containing dollar amounts + internal entity references.
    # Rio's rule: never extract LivingIP/Teleo deal terms to public codex. (Ganymede review)
    if ftype == "claim":
        # Guard: description may be present-but-None or a non-string (the
        # schema check above already flagged it) — coerce so this
        # concatenation cannot raise TypeError.
        desc_text = desc if isinstance(desc, str) else ""
        combined_text = f"{title} {desc_text} {body}".lower()
        has_dollar = bool(re.search(r"\$[\d,.]+[mkb]?\b", combined_text, re.IGNORECASE))
        has_internal = bool(re.search(
            r"\b(livingip|teleo|internal|deal terms?|valuation|equity percent)",
            combined_text, re.IGNORECASE,
        ))
        if has_dollar and has_internal:
            issues.append("opsec_internal_deal_terms")
    # Body substance check (claims only)
    if ftype == "claim" and body:
        # Strip the H1 title line and check remaining content
        body_no_h1 = re.sub(r"^# .+\n*", "", body).strip()
        # Drop trailing "Relevant Notes" / "Topics" sections after the first ---
        body_content = re.split(r"\n---\n", body_no_h1)[0].strip()
        if len(body_content) < 50:
            issues.append("body_too_thin")
    # Near-duplicate check (claims only, not entities)
    if ftype != "entity":
        title_lower = Path(filename).stem.replace("-", " ").lower()
        title_words = set(title_lower.split()[:6])
        for existing in existing_claims:
            # Normalize existing stem: hyphens → spaces for consistent comparison
            existing_normalized = existing.replace("-", " ").lower()
            # Cheap prefilter: require >=2 shared leading words before running
            # the comparatively expensive SequenceMatcher ratio.
            if len(title_words & set(existing_normalized.split()[:6])) < 2:
                continue
            ratio = SequenceMatcher(None, title_lower, existing_normalized).ratio()
            if ratio >= DEDUP_THRESHOLD:
                issues.append(f"near_duplicate:{existing[:80]}")
                break  # One is enough to flag
    return issues
# ─── Main entry point ──────────────────────────────────────────────────────
def validate_and_fix_claims(
    claims: list[dict],
    domain: str,
    agent: str,
    existing_claims: set[str],
    repo_root: str = ".",
) -> tuple[list[dict], list[dict], dict]:
    """Validate and fix extracted claims. Returns (kept_claims, rejected_claims, stats).

    Each claim dict has: filename, domain, content
    Returned claims have content fixed where possible.
    Stats: {total, kept, fixed, rejected, fixes_applied: [...], rejections: [...]}
    """
    kept = []
    rejected = []
    all_fixes = []
    all_rejections = []
    # Add intra-batch stems to existing claims (avoid false positive duplicates
    # within the same extraction). Use .get so a claim dict missing "filename"
    # doesn't raise here — it is rejected in the loop below anyway.
    batch_stems = {Path(c.get("filename", "")).stem for c in claims if c.get("filename")}
    existing_plus_batch = existing_claims | batch_stems
    for claim in claims:
        filename = claim.get("filename", "")
        content = claim.get("content", "")
        claim_domain = claim.get("domain", domain)
        if not filename or not content:
            rejected.append(claim)
            all_rejections.append(f"{filename or '?'}:missing_filename_or_content")
            continue
        # Phase 1: Apply fixers
        content, fixes1 = fix_frontmatter(content, claim_domain, agent)
        content, fixes2 = fix_wiki_links(content, existing_plus_batch)
        content, fixes3 = fix_trailing_newline(content)
        content, fixes4 = fix_h1_title_match(content, filename)
        fixes = fixes1 + fixes2 + fixes3 + fixes4
        if fixes:
            # Tag each fix with the file it applies to (this was a literal
            # "(unknown)" placeholder before, making the logs untraceable).
            all_fixes.extend(f"{filename}:{f}" for f in fixes)
        # Phase 2: Validate (after fixes)
        issues = validate_claim(filename, content, existing_claims, agent=agent)
        # Near-duplicates are warnings; everything else is a hard failure.
        hard_failures = [i for i in issues if not i.startswith("near_duplicate")]
        warnings = [i for i in issues if i.startswith("near_duplicate")]
        if hard_failures:
            rejected.append({**claim, "content": content, "issues": hard_failures})
            all_rejections.extend(f"{filename}:{i}" for i in hard_failures)
        else:
            if warnings:
                all_fixes.extend(f"{filename}:WARN:{w}" for w in warnings)
            kept.append({**claim, "content": content})
    stats = {
        "total": len(claims),
        "kept": len(kept),
        # "fixed" counts individual fix entries (warnings excluded), not files.
        "fixed": len([f for f in all_fixes if ":WARN:" not in f]),
        "rejected": len(rejected),
        "fixes_applied": all_fixes,
        "rejections": all_rejections,
    }
    logger.info(
        "Post-extraction: %d/%d claims kept (%d fixed, %d rejected)",
        stats["kept"], stats["total"], stats["fixed"], stats["rejected"],
    )
    return kept, rejected, stats
def validate_and_fix_entities(
    entities: list[dict],
    domain: str,
    existing_claims: set[str],
) -> tuple[list[dict], list[dict], dict]:
    """Validate and fix extracted entities. Returns (kept, rejected, stats).

    Lighter validation than claims — entities are factual records, not
    arguable propositions. Input dicts are never mutated; the trailing-newline
    fix is applied to a copy.
    """
    kept = []
    rejected = []
    all_issues = []
    for ent in entities:
        filename = ent.get("filename", "")
        content = ent.get("content", "")
        action = ent.get("action", "create")
        if not filename:
            rejected.append(ent)
            all_issues.append("missing_filename")
            continue
        issues = []
        if action == "create" and content:
            fm, body = parse_frontmatter(content)
            if fm is None:
                issues.append("no_frontmatter")
            else:
                if fm.get("type") != "entity":
                    issues.append("wrong_type")
                if "entity_type" not in fm:
                    issues.append("missing_entity_type")
                if "domain" not in fm:
                    issues.append("missing_domain")
                # decision_market specific checks
                if fm.get("entity_type") == "decision_market":
                    for field in ("parent_entity", "platform", "category", "status"):
                        if field not in fm:
                            issues.append(f"dm_missing:{field}")
            # Fix trailing newline on a COPY — previously this mutated the
            # caller's dict in place.
            if content and not content.endswith("\n"):
                ent = {**ent, "content": content + "\n"}
        elif action == "update":
            timeline = ent.get("timeline_entry", "")
            if not timeline:
                issues.append("update_no_timeline")
        if issues:
            rejected.append({**ent, "issues": issues})
            # Tag each issue with the filename (was a "(unknown)" placeholder).
            all_issues.extend(f"{filename}:{i}" for i in issues)
        else:
            kept.append(ent)
    stats = {
        "total": len(entities),
        "kept": len(kept),
        "rejected": len(rejected),
        "issues": all_issues,
    }
    return kept, rejected, stats
def load_existing_claims_from_repo(repo_root: str) -> set[str]:
    """Build set of known claim/entity stems from the repo."""
    root = Path(repo_root)
    stems: set[str] = set()
    # Only these top-level directories hold KB markdown files.
    for sub in ("domains", "core", "foundations", "maps", "agents", "schemas", "entities"):
        directory = root / sub
        if not directory.is_dir():
            continue
        stems.update(md_file.stem for md_file in directory.rglob("*.md"))
    return stems
# ─── Helpers ────────────────────────────────────────────────────────────────
def _rebuild_content(fm: dict, body: str) -> str:
    """Rebuild markdown content from frontmatter dict and body."""
    # Canonical ordering for well-known frontmatter fields.
    preferred = ("type", "entity_type", "name", "domain", "description",
                 "confidence", "source", "created", "status", "parent_entity",
                 "platform", "proposer", "proposal_url", "proposal_date",
                 "resolution_date", "category", "summary", "tracked_by",
                 "secondary_domains", "challenged_by")
    out = ["---"]
    emitted: set = set()
    for key in preferred:
        if fm.get(key) is not None:
            out.append(_yaml_line(key, fm[key]))
            emitted.add(key)
    # Any leftover fields follow, in the dict's own order; nulls are dropped.
    out.extend(
        _yaml_line(key, value)
        for key, value in fm.items()
        if key not in emitted and value is not None
    )
    out += ["---", "", body]
    text = "\n".join(out)
    return text if text.endswith("\n") else text + "\n"
def _yaml_line(key: str, val) -> str:
"""Format a single YAML key-value line."""
if isinstance(val, dict):
# Nested YAML block (e.g. attribution with sub-keys)
lines = [f"{key}:"]
for sub_key, sub_val in val.items():
if isinstance(sub_val, list) and sub_val:
lines.append(f" {sub_key}:")
for item in sub_val:
if isinstance(item, dict):
first = True
for ik, iv in item.items():
prefix = " - " if first else " "
lines.append(f'{prefix}{ik}: "{iv}"')
first = False
else:
lines.append(f' - "{item}"')
else:
lines.append(f" {sub_key}: []")
return "\n".join(lines)
if isinstance(val, list):
return f"{key}: {json.dumps(val)}"
if isinstance(val, bool):
return f"{key}: {'true' if val else 'false'}"
if isinstance(val, (int, float)):
return f"{key}: {val}"
if isinstance(val, date):
return f"{key}: {val.isoformat()}"
# String — quote if it contains special chars
s = str(val)
if any(c in s for c in ":#{}[]|>&*!%@`"):
return f'{key}: "{s}"'
return f"{key}: {s}"