# teleo-infrastructure/lib/post_extract.py
# 2026-05-29 15:08:09 +02:00
#
# 558 lines
# 21 KiB
# Python

"""Post-extraction validator — deterministic fixes and quality gate.
Runs AFTER LLM extraction, BEFORE git commit. Pure Python, $0 cost.
Catches the mechanical issues that account for 73% of eval rejections:
- Frontmatter schema violations (missing/invalid fields)
- Broken wiki links (strips brackets, keeps text)
- Date errors (wrong format, source date instead of today)
- Filename convention violations
- Title precision (too short, not a proposition)
- Duplicate detection against existing KB
Design principles (Leo):
- Mechanical rules belong in code, not prompts
- Fix what's fixable, reject what's not
- Never silently drop content — log everything
Epimetheus owns this module. Leo reviews changes.
"""
import json
import logging
import re
from datetime import date, datetime
from difflib import SequenceMatcher
from pathlib import Path
# Module-level logger; validate_and_fix_claims emits its summary line through it.
logger = logging.getLogger("pipeline.post_extract")
# ─── Constants ──────────────────────────────────────────────────────────────
# Values accepted in the frontmatter ``domain`` field. fix_frontmatter replaces
# anything else with the caller-supplied domain; validate_claim reports
# ``invalid_domain`` for anything else.
VALID_DOMAINS = frozenset({
"internet-finance", "entertainment", "health", "ai-alignment",
"space-development", "grand-strategy", "mechanisms", "living-capital",
"living-agents", "teleohumanity", "critical-systems",
"collective-intelligence", "teleological-economics", "cultural-dynamics",
})
# Values accepted in the frontmatter ``confidence`` field (claims only).
VALID_CONFIDENCE = frozenset({"proven", "likely", "experimental", "speculative"})
# Frontmatter keys that must be present and non-null. validate_claim uses the
# claim tuple when ``type`` is "claim" and the entity tuple for every other type.
REQUIRED_CLAIM_FIELDS = ("type", "domain", "description", "confidence", "source", "created")
REQUIRED_ENTITY_FIELDS = ("type", "domain", "description")
# Matches ``[[target]]`` wiki links; group 1 is the text between the brackets.
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
# Minimum title word count for claims (Leo: titles must name specific mechanism)
# validate_claim requires titles with fewer words than this to also contain a
# verb/connective; titles under 4 words are rejected outright.
MIN_TITLE_WORDS = 8
# SequenceMatcher ratio at or above which validate_claim reports a
# ``near_duplicate`` against an existing stem.
DEDUP_THRESHOLD = 0.85
# ─── YAML parsing ──────────────────────────────────────────────────────────
def parse_frontmatter(text: str) -> tuple[dict | None, str]:
"""Extract YAML frontmatter from markdown. Returns (frontmatter_dict, body)."""
if not text.startswith("---"):
return None, text
end = text.find("---", 3)
if end == -1:
return None, text
raw = text[3:end]
body = text[end + 3:].strip()
try:
import yaml
fm = yaml.safe_load(raw)
if not isinstance(fm, dict):
return None, body
for key, value in list(fm.items()):
if isinstance(value, date | datetime):
fm[key] = value.isoformat()
return fm, body
except ImportError:
pass
except Exception:
return None, body
# Fallback: simple key-value parser
fm = {}
for line in raw.strip().split("\n"):
line = line.strip()
if not line or line.startswith("#"):
continue
if ":" not in line:
continue
key, _, val = line.partition(":")
key = key.strip()
val = val.strip().strip('"').strip("'")
if val.lower() == "null" or val == "":
val = None
elif val.startswith("["):
val = [v.strip().strip('"').strip("'") for v in val.strip("[]").split(",") if v.strip()]
fm[key] = val
return fm if fm else None, body
# ─── Fixers (modify content, return fixed version) ─────────────────────────
def fix_frontmatter(content: str, domain: str, agent: str) -> tuple[str, list[str]]:
    """Fix common frontmatter issues.

    Args:
        content: Full markdown document including frontmatter.
        domain: Domain written into the frontmatter when the existing value is
            missing or not in ``VALID_DOMAINS``.
        agent: Agent name used to build the default ``source`` value.

    Returns:
        ``(fixed_content, fixes)``. When nothing needed fixing the original
        content is returned unchanged with an empty list. When the document has
        no parseable frontmatter the original content is returned with
        ``["unfixable:no_frontmatter"]``.
    """
    fixes: list[str] = []
    fm, body = parse_frontmatter(content)
    if fm is None:
        return content, ["unfixable:no_frontmatter"]

    # Read before Fix 2 runs: a missing type is treated as "claim" throughout.
    is_claim = fm.get("type", "claim") == "claim"

    # Fix 1: created = extraction date, always today. No parsing, no comparison.
    # "created" means "when this was extracted," period. Source publication date
    # belongs in a separate field if needed. (Ganymede review)
    if is_claim:
        today_str = date.today().isoformat()
        if fm.get("created") != today_str:
            fixes.append(f"set_created:{today_str}")
        fm["created"] = today_str

    # Fix 2: type field
    if "type" not in fm:
        fm["type"] = "claim"
        fixes.append("added_type:claim")

    # Fix 3: domain field. The previous value is captured BEFORE overwriting so
    # the fix log records what was actually replaced. Non-string values (e.g. a
    # YAML list) are treated as invalid instead of raising on the set lookup.
    current_domain = fm.get("domain")
    if not isinstance(current_domain, str) or current_domain not in VALID_DOMAINS:
        previous = "missing" if current_domain is None else current_domain
        fm["domain"] = domain
        fixes.append(f"fixed_domain:{previous}->{domain}")

    # Fix 4: confidence field (claims only)
    if is_claim:
        conf = fm.get("confidence")
        if conf is None:
            fm["confidence"] = "experimental"
            fixes.append("added_confidence:experimental")
        elif not isinstance(conf, str) or conf not in VALID_CONFIDENCE:
            fm["confidence"] = "experimental"
            fixes.append(f"fixed_confidence:{conf}->experimental")

    # Fix 5: description field
    if not fm.get("description"):
        # Try to derive from the first non-empty body line: strip heading
        # markers and keep the text up to the first period.
        first_sentence = ""
        for line in body.splitlines():
            first_sentence = line.strip().lstrip("# ")
            if first_sentence:
                first_sentence = first_sentence.split(".")[0].strip()
                break
        if len(first_sentence) > 10:
            fm["description"] = first_sentence[:200]
            fixes.append("derived_description_from_body")

    # Fix 6: source field (claims only)
    if is_claim and not fm.get("source"):
        fm["source"] = f"extraction by {agent}"
        fixes.append("added_default_source")

    if not fixes:
        return content, []

    # Reconstruct frontmatter
    return _rebuild_content(fm, body), fixes
def fix_wiki_links(content: str, existing_claims: set[str]) -> tuple[str, list[str]]:
    """Repair wiki links that do not point at a known stem.

    LLM output frequently writes link targets as hyphenated slugs while KB
    filenames use spaces. Each ``[[target]]`` is handled as follows:

    * exact match in ``existing_claims``: left untouched;
    * match after lowercasing and turning hyphens into spaces: rewritten to
      the known stem;
    * otherwise: brackets removed, link text kept.

    Returns ``(fixed_content, fixes)`` with one log entry per rewritten or
    stripped link.
    """
    applied: list[str] = []

    # normalized form (lowercase, hyphens -> spaces) -> original stem
    stems_by_normalized: dict[str, str] = {
        stem.lower().replace("-", " "): stem for stem in existing_claims
    }

    def _rewrite(match):
        target = match.group(1).strip()
        if target in existing_claims:
            # Exact match: keep the link exactly as written.
            return match.group(0)

        canonical = stems_by_normalized.get(target.lower().replace("-", " "))
        if canonical is None:
            applied.append(f"stripped_wiki_link:{target[:60]}")
            # Keep the text, drop the brackets.
            return target

        applied.append(f"resolved_wiki_link:{target[:40]}->{canonical[:40]}")
        return f"[[{canonical}]]"

    return WIKI_LINK_RE.sub(_rewrite, content), applied
def fix_trailing_newline(content: str) -> tuple[str, list[str]]:
    """Ensure file ends with exactly one newline.

    Returns ``(fixed_content, fixes)``. A missing newline is added
    (``added_trailing_newline``); a run of several trailing newlines is
    collapsed to one (``collapsed_trailing_newlines``). Content that already
    ends with a single newline is returned unchanged with an empty list.
    """
    if not content.endswith("\n"):
        return content + "\n", ["added_trailing_newline"]

    normalized = content.rstrip("\n") + "\n"
    if normalized != content:
        return normalized, ["collapsed_trailing_newlines"]
    return content, []
def fix_h1_title_match(content: str, filename: str) -> tuple[str, list[str]]:
    """Add an H1 title when the body has none; never touch an existing H1.

    The H1 in the content is authoritative: the filename is derived from it and
    may be truncated or slightly different. A title built from the filename
    stem (hyphens turned into spaces) is inserted only when the body is
    non-empty, contains no ``# `` heading line, and does not itself start
    with ``#``.

    Returns ``(content, fixes)``; ``fixes`` is ``["added_h1_title"]`` when a
    title was inserted and empty otherwise.
    """
    title_from_filename = Path(filename).stem.replace("-", " ")

    fm, body = parse_frontmatter(content)
    if fm is None:
        return content, []

    # An existing H1 anywhere in the body wins; leave the document alone.
    if re.search(r"^# (.+)$", body, re.MULTILINE) is not None:
        return content, []

    # Nothing to prepend to, or the body already opens with some heading marker.
    if not body or body.startswith("#"):
        return content, []

    with_title = f"# {title_from_filename}\n\n{body}"
    return _rebuild_content(fm, with_title), ["added_h1_title"]
# ─── Validators (check without modifying, return issues) ──────────────────
def validate_claim(filename: str, content: str, existing_claims: set[str], agent: str | None = None) -> list[str]:
    """Validate a claim file without modifying it.

    Args:
        filename: File name of the claim; its stem is the fallback title and
            the key used for near-duplicate comparison.
        content: Full markdown document including frontmatter.
        existing_claims: Known stems to compare against for near-duplicates.
        agent: Forwarded to ``validate_attribution`` for claims.

    Returns:
        List of issue codes; an empty list means the file passed. A document
        without parseable frontmatter yields ``["no_frontmatter"]``.
    """
    issues = []
    fm, body = parse_frontmatter(content)
    if fm is None:
        return ["no_frontmatter"]

    ftype = fm.get("type", "claim")
    is_claim = ftype == "claim"

    # Schema check
    required = REQUIRED_CLAIM_FIELDS if is_claim else REQUIRED_ENTITY_FIELDS
    for field in required:
        if fm.get(field) is None:
            issues.append(f"missing_field:{field}")

    # Domain check. Non-string values (e.g. a YAML list) are reported as
    # invalid rather than raising on the frozenset lookup.
    domain = fm.get("domain")
    if domain and (not isinstance(domain, str) or domain not in VALID_DOMAINS):
        issues.append(f"invalid_domain:{domain}")

    # Confidence check (claims only)
    if is_claim:
        conf = fm.get("confidence")
        if conf and (not isinstance(conf, str) or conf not in VALID_CONFIDENCE):
            issues.append(f"invalid_confidence:{conf}")

    # Title checks (claims and frameworks, not entities)
    # Use H1 from body if available (authoritative), fall back to filename
    title = ""
    if ftype in ("claim", "framework"):
        h1_match = re.search(r"^# (.+)$", body, re.MULTILINE)
        title = h1_match.group(1).strip() if h1_match else Path(filename).stem.replace("-", " ")
        words = title.split()
        # Always enforce minimum 4 words — a 2-3 word title is never specific
        # enough to disagree with. (Ganymede review)
        if len(words) < 4:
            issues.append("title_too_few_words")
        elif len(words) < MIN_TITLE_WORDS:
            # For shorter titles, also require a verb/connective
            has_verb = bool(re.search(
                r"\b(is|are|was|were|will|would|can|could|should|must|has|have|had|"
                r"does|did|do|may|might|shall|"
                r"because|therefore|however|although|despite|since|through|by|"
                r"when|where|while|if|unless|"
                r"rather than|instead of|not just|more than|"
                r"\w+(?:s|ed|ing|es|tes|ses|zes|ves|cts|pts|nts|rns))\b",
                title, re.IGNORECASE,
            ))
            if not has_verb:
                issues.append("title_not_proposition")

    # Description quality
    desc = fm.get("description", "")
    if isinstance(desc, str) and len(desc.strip()) < 10:
        issues.append("description_too_short")

    # Attribution check: extractor must be identified. (Leo: block extractor, warn sourcer)
    if is_claim:
        from .attribution import validate_attribution
        issues.extend(validate_attribution(fm, agent=agent))

    # OPSEC check: flag claims containing dollar amounts + internal entity references.
    # Rio's rule: never extract LivingIP/Teleo deal terms to public codex. (Ganymede review)
    if is_claim:
        # A null or non-string description must not break the concatenation:
        # the missing field is already reported above.
        if isinstance(desc, str):
            desc_text = desc
        elif desc is None:
            desc_text = ""
        else:
            desc_text = str(desc)
        combined_text = f"{title} {desc_text} {body}".lower()
        has_dollar = bool(re.search(r"\$[\d,.]+[mkb]?\b", combined_text, re.IGNORECASE))
        # NOTE(review): this pattern has no trailing word boundary, so "teleo"
        # also matches words such as "teleological" — confirm that breadth is intended.
        has_internal = bool(re.search(
            r"\b(livingip|teleo|internal|deal terms?|valuation|equity percent)",
            combined_text, re.IGNORECASE,
        ))
        if has_dollar and has_internal:
            issues.append("opsec_internal_deal_terms")

    # Body substance check (claims only)
    if is_claim and body:
        # Strip the H1 title line and check remaining content
        body_no_h1 = re.sub(r"^# .+\n*", "", body).strip()
        # Keep only the text before the first "---" rule (drops trailing
        # "Relevant Notes" / "Topics" sections)
        body_content = re.split(r"\n---\n", body_no_h1)[0].strip()
        if len(body_content) < 50:
            issues.append("body_too_thin")

    # Near-duplicate check (every type except entities)
    if ftype != "entity":
        title_lower = Path(filename).stem.replace("-", " ").lower()
        title_words = set(title_lower.split()[:6])
        for existing in existing_claims:
            # Normalize existing stem: hyphens → spaces for consistent comparison
            existing_normalized = existing.replace("-", " ").lower()
            # Cheap pre-filter: skip the expensive ratio unless at least two
            # of the first six words overlap.
            if len(title_words & set(existing_normalized.split()[:6])) < 2:
                continue
            ratio = SequenceMatcher(None, title_lower, existing_normalized).ratio()
            if ratio >= DEDUP_THRESHOLD:
                issues.append(f"near_duplicate:{existing[:80]}")
                break  # One is enough to flag
    return issues
# ─── Main entry point ──────────────────────────────────────────────────────
def validate_and_fix_claims(
    claims: list[dict],
    domain: str,
    agent: str,
    existing_claims: set[str],
    repo_root: str = ".",
) -> tuple[list[dict], list[dict], dict]:
    """Validate and fix extracted claims.

    Args:
        claims: Claim dicts with ``filename``, ``content`` and optionally
            ``domain``.
        domain: Default domain for claims that do not carry their own.
        agent: Extracting agent, passed to the fixers and validator.
        existing_claims: Stems already known to the KB.
        repo_root: Not used by this function.

    Returns:
        ``(kept_claims, rejected_claims, stats)``. Kept and rejected claims
        carry the fixed content; rejected claims that reached validation also
        carry an ``issues`` list.
        Stats: {total, kept, fixed, rejected, fixes_applied: [...], rejections: [...]}
        where ``fixed`` counts individual fixes applied (warnings excluded).
    """
    kept = []
    rejected = []
    all_fixes = []
    all_rejections = []

    # Stems from this batch count as valid wiki-link targets, so links between
    # claims extracted together are not stripped. Entries without a filename
    # are skipped here; the loop below rejects them explicitly.
    batch_stems = {Path(c["filename"]).stem for c in claims if c.get("filename")}
    existing_plus_batch = existing_claims | batch_stems

    for claim in claims:
        filename = claim.get("filename", "")
        content = claim.get("content", "")
        claim_domain = claim.get("domain", domain)
        if not filename or not content:
            rejected.append(claim)
            all_rejections.append(f"{filename or '?'}:missing_filename_or_content")
            continue

        # Phase 1: Apply fixers
        content, fixes1 = fix_frontmatter(content, claim_domain, agent)
        content, fixes2 = fix_wiki_links(content, existing_plus_batch)
        content, fixes3 = fix_trailing_newline(content)
        content, fixes4 = fix_h1_title_match(content, filename)
        fixes = fixes1 + fixes2 + fixes3 + fixes4
        if fixes:
            all_fixes.extend([f"{filename}:{f}" for f in fixes])

        # Phase 2: Validate (after fixes). Only the pre-existing stems are
        # passed, so a claim is never flagged as a near-duplicate of itself.
        issues = validate_claim(filename, content, existing_claims, agent=agent)

        # Separate hard failures from warnings
        hard_failures = [i for i in issues if not i.startswith("near_duplicate")]
        warnings = [i for i in issues if i.startswith("near_duplicate")]
        if hard_failures:
            rejected.append({**claim, "content": content, "issues": hard_failures})
            all_rejections.extend([f"{filename}:{i}" for i in hard_failures])
        else:
            if warnings:
                all_fixes.extend([f"{filename}:WARN:{w}" for w in warnings])
            kept.append({**claim, "content": content})

    stats = {
        "total": len(claims),
        "kept": len(kept),
        "fixed": len([f for f in all_fixes if ":WARN:" not in f]),
        "rejected": len(rejected),
        "fixes_applied": all_fixes,
        "rejections": all_rejections,
    }
    logger.info(
        "Post-extraction: %d/%d claims kept (%d fixed, %d rejected)",
        stats["kept"], stats["total"], stats["fixed"], stats["rejected"],
    )
    return kept, rejected, stats
def validate_and_fix_entities(
    entities: list[dict],
    domain: str,
    existing_claims: set[str],
) -> tuple[list[dict], list[dict], dict]:
    """Validate and fix extracted entities.

    Lighter validation than claims — entities are factual records, not arguable propositions.

    Args:
        entities: Entity dicts with ``filename``, optional ``content``,
            optional ``action`` (defaults to ``"create"``) and, for updates,
            ``timeline_entry``.
        domain: Not used by this function.
        existing_claims: Not used by this function.

    Returns:
        ``(kept, rejected, stats)``. Rejected entities that reached validation
        carry an ``issues`` list. Stats: {total, kept, rejected, issues: [...]}.
        Input dicts are never modified; an entity whose content needed a
        trailing newline is returned as a copy.
    """
    kept = []
    rejected = []
    all_issues = []

    for ent in entities:
        filename = ent.get("filename", "")
        content = ent.get("content", "")
        action = ent.get("action", "create")

        if not filename:
            rejected.append(ent)
            all_issues.append("missing_filename")
            continue

        issues = []
        if action == "create" and content:
            fm, _body = parse_frontmatter(content)
            if fm is None:
                issues.append("no_frontmatter")
            else:
                if fm.get("type") != "entity":
                    issues.append("wrong_type")
                if "entity_type" not in fm:
                    issues.append("missing_entity_type")
                if "domain" not in fm:
                    issues.append("missing_domain")
                # decision_market specific checks
                if fm.get("entity_type") == "decision_market":
                    for field in ("parent_entity", "platform", "category", "status"):
                        if field not in fm:
                            issues.append(f"dm_missing:{field}")
            # Fix trailing newline on a copy so the caller's dict is left
            # untouched (same approach as the claims path).
            if not content.endswith("\n"):
                ent = {**ent, "content": content + "\n"}
        elif action == "update":
            if not ent.get("timeline_entry", ""):
                issues.append("update_no_timeline")

        if issues:
            rejected.append({**ent, "issues": issues})
            all_issues.extend([f"{filename}:{i}" for i in issues])
        else:
            kept.append(ent)

    stats = {
        "total": len(entities),
        "kept": len(kept),
        "rejected": len(rejected),
        "issues": all_issues,
    }
    return kept, rejected, stats
def load_existing_claims_from_repo(repo_root: str) -> set[str]:
    """Collect the stems of all markdown files under the known KB directories.

    Walks ``domains``, ``core``, ``foundations``, ``maps``, ``agents``,
    ``schemas`` and ``entities`` beneath ``repo_root`` recursively; directories
    that do not exist are skipped. Returns the set of ``*.md`` file stems.
    """
    root = Path(repo_root)
    kb_dirs = (
        root / name
        for name in ("domains", "core", "foundations", "maps", "agents", "schemas", "entities")
    )
    return {
        md_file.stem
        for directory in kb_dirs
        if directory.is_dir()
        for md_file in directory.rglob("*.md")
    }
# ─── Helpers ────────────────────────────────────────────────────────────────
def _rebuild_content(fm: dict, body: str) -> str:
    """Serialise a frontmatter dict and a body back into a markdown document.

    Known fields are written first in a fixed order, followed by any other
    keys in the dict's own order. Keys whose value is ``None`` are omitted.
    The result always ends with a newline.
    """
    # Canonical ordering for the fields this pipeline knows about.
    preferred_order = ["type", "entity_type", "name", "domain", "description",
                       "confidence", "source", "created", "status", "parent_entity",
                       "platform", "proposer", "proposal_url", "proposal_date",
                       "resolution_date", "category", "summary", "tracked_by",
                       "secondary_domains", "challenged_by"]

    leading_keys = [key for key in preferred_order if key in fm]
    trailing_keys = [key for key in fm if key not in leading_keys]

    rendered_fields = [
        _yaml_line(key, fm[key])
        for key in leading_keys + trailing_keys
        if fm[key] is not None
    ]

    document = "\n".join(["---", *rendered_fields, "---", "", body])
    return document if document.endswith("\n") else document + "\n"
def _yaml_line(key: str, val) -> str:
"""Format a single YAML key-value line."""
if isinstance(val, dict):
# Nested YAML block (e.g. attribution with sub-keys)
lines = [f"{key}:"]
for sub_key, sub_val in val.items():
if isinstance(sub_val, list) and sub_val:
lines.append(f" {sub_key}:")
for item in sub_val:
if isinstance(item, dict):
first = True
for ik, iv in item.items():
prefix = " - " if first else " "
lines.append(f'{prefix}{ik}: "{iv}"')
first = False
else:
lines.append(f' - "{item}"')
else:
lines.append(f" {sub_key}: []")
return "\n".join(lines)
if isinstance(val, list):
return f"{key}: {json.dumps(val)}"
if isinstance(val, bool):
return f"{key}: {'true' if val else 'false'}"
if isinstance(val, (int, float)):
return f"{key}: {val}"
if isinstance(val, date):
return f"{key}: {val.isoformat()}"
# String — quote if it contains special chars
s = str(val)
if any(c in s for c in ":#{}[]|>&*!%@`"):
return f'{key}: "{s}"'
return f"{key}: {s}"