teleo-infrastructure/lib/validate.py
"""Validate stage — Tier 0 deterministic validation gate.
Ported from tier0-gate.py + validate_claims.py. Pure Python, no LLM calls.
Validates claim frontmatter, title format, wiki links, domain-directory match,
proposition heuristic, universal quantifiers, near-duplicate detection.
Runs against PRs with status 'open' whose tier0_pass is still NULL. Posts
results as PR comments and records tier0_pass = 0/1 on the PR row.
"""
import json
import logging
import re
from datetime import date, datetime, timezone
from difflib import SequenceMatcher
from pathlib import Path
from . import config, db
logger = logging.getLogger("pipeline.validate")
# ─── Constants ──────────────────────────────────────────────────────────────
VALID_DOMAINS = frozenset({
"internet-finance", "entertainment", "health", "ai-alignment",
"space-development", "grand-strategy", "mechanisms", "living-capital",
"living-agents", "teleohumanity", "critical-systems",
"collective-intelligence", "teleological-economics", "cultural-dynamics",
})
VALID_CONFIDENCE = frozenset({"proven", "likely", "experimental", "speculative"})
VALID_TYPES = frozenset({"claim", "framework"})
REQUIRED_FIELDS = ("type", "domain", "description", "confidence", "source", "created")
DATE_MIN = date(2020, 1, 1)
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
DEDUP_THRESHOLD = 0.85
# Proposition heuristic patterns
_STRONG_SIGNALS = re.compile(
r"\b(because|therefore|however|although|despite|since|"
r"rather than|instead of|not just|more than|less than|"
r"by\b|through\b|via\b|without\b|"
r"when\b|where\b|while\b|if\b|unless\b|"
r"which\b|that\b|"
r"is\b|are\b|was\b|were\b|will\b|would\b|"
r"can\b|could\b|should\b|must\b|"
r"has\b|have\b|had\b|does\b|did\b)",
re.IGNORECASE,
)
_VERB_ENDINGS = re.compile(
r"\b\w{2,}(ed|ing|es|tes|ses|zes|ves|cts|pts|nts|rns|ps|ts|rs|ns|ds)\b",
re.IGNORECASE,
)
_UNIVERSAL_QUANTIFIERS = re.compile(
r"\b(all|every|always|never|no one|nobody|nothing|none of|"
r"the only|the fundamental|the sole|the single|"
r"universally|invariably|without exception|in every case)\b",
re.IGNORECASE,
)
_SCOPING_LANGUAGE = re.compile(
r"\b(when|if|under|given|assuming|provided|in cases where|"
r"for .+ that|among|within|across|during|between|"
r"approximately|roughly|nearly|most|many|often|typically|"
r"tends? to|generally|usually|frequently)\b",
re.IGNORECASE,
)
# ─── YAML frontmatter parser ───────────────────────────────────────────────
def parse_frontmatter(text: str) -> tuple[dict | None, str]:
"""Extract YAML frontmatter and body from markdown text."""
if not text.startswith("---"):
return None, text
    # The closing delimiter must start its own line, so a stray "---" inside a
    # value is not mistaken for the end of the frontmatter.
    end = text.find("\n---", 3)
    if end == -1:
        return None, text
    raw = text[3:end]
    body = text[end + 4:].strip()
try:
import yaml
fm = yaml.safe_load(raw)
if not isinstance(fm, dict):
return None, body
return fm, body
except ImportError:
pass
except Exception:
return None, body
# Fallback: simple key-value parser
fm = {}
for line in raw.strip().split("\n"):
line = line.strip()
if not line or line.startswith("#"):
continue
if ":" not in line:
continue
key, _, val = line.partition(":")
key = key.strip()
val = val.strip().strip('"').strip("'")
if val.lower() == "null" or val == "":
val = None
elif val.startswith("["):
val = [v.strip().strip('"').strip("'")
for v in val.strip("[]").split(",") if v.strip()]
fm[key] = val
return fm if fm else None, body
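# Example (illustrative): a well-formed claim file splits into metadata and body.
#
#   parse_frontmatter("---\ntype: claim\nconfidence: likely\n---\nBody text.")
#   -> ({'type': 'claim', 'confidence': 'likely'}, 'Body text.')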
# ─── Validators ─────────────────────────────────────────────────────────────
def validate_schema(fm: dict) -> list[str]:
"""Check required fields and valid enums."""
violations = []
for field in REQUIRED_FIELDS:
if field not in fm or fm[field] is None:
violations.append(f"missing_field:{field}")
ftype = fm.get("type")
if ftype and ftype not in VALID_TYPES:
violations.append(f"invalid_type:{ftype}")
domain = fm.get("domain")
if domain and domain not in VALID_DOMAINS:
violations.append(f"invalid_domain:{domain}")
confidence = fm.get("confidence")
if confidence and confidence not in VALID_CONFIDENCE:
violations.append(f"invalid_confidence:{confidence}")
desc = fm.get("description")
if isinstance(desc, str) and len(desc.strip()) < 10:
violations.append("description_too_short")
source = fm.get("source")
if isinstance(source, str) and len(source.strip()) < 3:
violations.append("source_too_short")
return violations
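# Example (illustrative): missing required fields are reported first, in
# REQUIRED_FIELDS order, then enum violations for the fields that are present.
#
#   validate_schema({"type": "note", "domain": "health"})
#   -> ['missing_field:description', 'missing_field:confidence',
#       'missing_field:source', 'missing_field:created', 'invalid_type:note']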
def validate_date(date_val) -> list[str]:
"""Validate created date."""
violations = []
if date_val is None:
return ["missing_field:created"]
parsed = None
    if isinstance(date_val, datetime):
        # datetime is a date subclass; normalize so the comparisons below never
        # mix datetime and date (which raises TypeError).
        parsed = date_val.date()
    elif isinstance(date_val, date):
        parsed = date_val
elif isinstance(date_val, str):
try:
parsed = datetime.strptime(date_val, "%Y-%m-%d").date()
except ValueError:
return [f"invalid_date_format:{date_val}"]
else:
return [f"invalid_date_type:{type(date_val).__name__}"]
today = date.today()
if parsed > today:
violations.append(f"future_date:{parsed}")
if parsed < DATE_MIN:
violations.append(f"date_before_2020:{parsed}")
return violations
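# Examples (illustrative):
#   validate_date("2019-06-01")  -> ['date_before_2020:2019-06-01']
#   validate_date("06/01/2024")  -> ['invalid_date_format:06/01/2024']
#   validate_date(None)          -> ['missing_field:created']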
def validate_title(filepath: str) -> list[str]:
"""Check filename follows prose-as-claim convention."""
violations = []
name = Path(filepath).stem
normalized = name.replace("-", " ")
if len(normalized) < 20:
violations.append("title_too_short")
words = normalized.split()
if len(words) < 4:
violations.append("title_too_few_words")
cleaned = re.sub(r"[a-zA-Z0-9\s\-\.,'()%]", "", name)
if cleaned:
violations.append(f"title_special_chars:{cleaned[:20]}")
return violations
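# Example (illustrative): a short label-style filename trips several checks at
# once, including the underscore (only hyphens are part of the convention).
#
#   validate_title("domains/health/sleep_notes.md")
#   -> ['title_too_short', 'title_too_few_words', 'title_special_chars:_']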
def validate_wiki_links(body: str, existing_claims: set[str]) -> list[str]:
"""Check that [[wiki links]] resolve to known claims."""
violations = []
for link in WIKI_LINK_RE.findall(body):
if link.strip() and link.strip() not in existing_claims:
violations.append(f"broken_wiki_link:{link.strip()[:80]}")
return violations
def validate_proposition(title: str) -> list[str]:
"""Check title reads as a proposition, not a label."""
normalized = title.replace("-", " ")
words = normalized.split()
n = len(words)
if n < 4:
return ["title_not_proposition:too short to be a disagreeable sentence"]
if _STRONG_SIGNALS.search(normalized):
return []
if _VERB_ENDINGS.search(normalized):
return []
if n >= 8:
return []
return ["title_not_proposition:no verb or connective found"]
def validate_universal_quantifiers(title: str) -> list[str]:
"""Flag unscoped universal quantifiers (warning, not gate)."""
universals = _UNIVERSAL_QUANTIFIERS.findall(title)
if universals and not _SCOPING_LANGUAGE.search(title):
return [f"unscoped_universal:{','.join(universals)}"]
return []
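# Example (illustrative): a universal is flagged only when no scoping language
# appears anywhere in the title.
#
#   validate_universal_quantifiers("all-models-hallucinate")
#   -> ['unscoped_universal:all']
#   validate_universal_quantifiers("all-models-hallucinate-under-distribution-shift")
#   -> []   # "under" scopes the claim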
def validate_domain_directory_match(filepath: str, fm: dict) -> list[str]:
"""Check file's directory matches its domain field."""
domain = fm.get("domain")
if not domain:
return []
parts = Path(filepath).parts
for i, part in enumerate(parts):
if part == "domains" and i + 1 < len(parts):
dir_domain = parts[i + 1]
if dir_domain != domain:
secondary = fm.get("secondary_domains", [])
if isinstance(secondary, str):
secondary = [secondary]
if dir_domain not in (secondary or []):
return [f"domain_directory_mismatch:file in domains/{dir_domain}/ "
f"but domain field says '{domain}'"]
break
return []
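# Example (illustrative): a file under domains/health/ whose frontmatter names a
# different primary domain (and no matching secondary) is rejected.
#
#   validate_domain_directory_match("domains/health/x.md", {"domain": "entertainment"})
#   -> ["domain_directory_mismatch:file in domains/health/ but domain field says 'entertainment'"]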
def validate_description_not_title(title: str, description: str) -> list[str]:
"""Check description adds info beyond the title."""
if not description:
return []
    # The title is a hyphenated stem while descriptions are prose; compare both
    # in prose form so an exact echo is caught by the substring check.
    title_lower = title.replace("-", " ").lower().strip()
desc_lower = description.lower().strip().rstrip(".")
if desc_lower in title_lower or title_lower in desc_lower:
return ["description_echoes_title"]
ratio = SequenceMatcher(None, title_lower, desc_lower).ratio()
if ratio > 0.75:
return [f"description_too_similar:{ratio:.0%}"]
return []
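# Example (illustrative): a description that merely restates the title is caught
# by the substring check once both are in prose form.
#
#   validate_description_not_title("sleep-debt-compounds-across-weeks",
#                                  "Sleep debt compounds across weeks.")
#   -> ['description_echoes_title']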
def find_near_duplicates(title: str, existing_claims: set[str]) -> list[str]:
"""Find near-duplicate titles using SequenceMatcher with word pre-filter."""
    title_lower = title.lower()
    # Stems are hyphen-separated, so split on hyphens: a whitespace split would
    # yield one token per title and the pre-filter would never fire.
    title_words = set(title_lower.split("-")[:6])
    warnings = []
    for existing in existing_claims:
        existing_lower = existing.lower()
        if len(title_words & set(existing_lower.split("-")[:6])) < 2:
            continue
continue
ratio = SequenceMatcher(None, title_lower, existing_lower).ratio()
if ratio >= DEDUP_THRESHOLD:
warnings.append(f"near_duplicate:{existing[:80]} (similarity={ratio:.2f})")
return warnings
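# Example (illustrative): the pre-filter admits this pair (four shared words in
# the first six), and SequenceMatcher then scores it above the 0.85 threshold.
#
#   find_near_duplicates("sleep-debt-compounds-across-weeks",
#                        {"sleep-debt-compounds-across-months"})
#   -> ['near_duplicate:sleep-debt-compounds-across-months (similarity=0.87)']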
# ─── Full Tier 0 validation ────────────────────────────────────────────────
def tier0_validate_claim(filepath: str, content: str, existing_claims: set[str]) -> dict:
"""Run full Tier 0 validation. Returns {filepath, passes, violations, warnings}."""
violations = []
warnings = []
fm, body = parse_frontmatter(content)
if fm is None:
return {"filepath": filepath, "passes": False,
"violations": ["no_frontmatter"], "warnings": []}
violations.extend(validate_schema(fm))
violations.extend(validate_date(fm.get("created")))
violations.extend(validate_title(filepath))
violations.extend(validate_wiki_links(body, existing_claims))
title = Path(filepath).stem
violations.extend(validate_proposition(title))
warnings.extend(validate_universal_quantifiers(title))
violations.extend(validate_domain_directory_match(filepath, fm))
desc = fm.get("description", "")
if isinstance(desc, str):
warnings.extend(validate_description_not_title(title, desc))
warnings.extend(find_near_duplicates(title, existing_claims))
return {"filepath": filepath, "passes": len(violations) == 0,
"violations": violations, "warnings": warnings}
# ─── Diff parsing ──────────────────────────────────────────────────────────
def extract_claim_files_from_diff(diff: str) -> dict[str, str]:
"""Parse unified diff to extract new/modified claim file contents."""
claim_dirs = ("domains/", "core/", "foundations/")
files = {}
current_file = None
current_lines = []
is_deletion = False
for line in diff.split("\n"):
if line.startswith("diff --git"):
if current_file and not is_deletion:
files[current_file] = "\n".join(current_lines)
current_file = None
current_lines = []
is_deletion = False
elif line.startswith("deleted file mode") or line.startswith("+++ /dev/null"):
is_deletion = True
current_file = None
elif line.startswith("+++ b/") and not is_deletion:
path = line[6:]
basename = path.rsplit("/", 1)[-1] if "/" in path else path
if (any(path.startswith(d) for d in claim_dirs)
and path.endswith(".md")
and not basename.startswith("_")):
current_file = path
elif current_file and line.startswith("+") and not line.startswith("+++"):
current_lines.append(line[1:])
if current_file and not is_deletion:
files[current_file] = "\n".join(current_lines)
return files
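# Example (illustrative): from a minimal diff adding one claim file, the content
# is rebuilt from its "+" lines; deletions and non-claim paths are skipped.
#
#   diff --git a/domains/health/some-claim.md b/domains/health/some-claim.md
#   +++ b/domains/health/some-claim.md
#   +---
#   +type: claim
#   +---
#   +Body.
#
#   -> {"domains/health/some-claim.md": "---\ntype: claim\n---\nBody."}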
# ─── Forgejo API (using merge module's helper) ─────────────────────────────
async def _forgejo_api(method: str, path: str, body: dict | None = None):
"""Call Forgejo API. Reuses merge module pattern."""
import aiohttp
url = f"{config.FORGEJO_URL}/api/v1{path}"
token = config.FORGEJO_TOKEN_FILE.read_text().strip() if config.FORGEJO_TOKEN_FILE.exists() else ""
headers = {"Authorization": f"token {token}", "Content-Type": "application/json"}
try:
async with aiohttp.ClientSession() as session:
async with session.request(method, url, headers=headers,
json=body, timeout=aiohttp.ClientTimeout(total=30)) as resp:
if resp.status >= 400:
text = await resp.text()
logger.error("Forgejo API %s %s%d: %s", method, path, resp.status, text[:200])
return None
if resp.status == 204:
return {}
return await resp.json()
except Exception as e:
logger.error("Forgejo API error: %s %s%s", method, path, e)
return None
async def _get_pr_diff(pr_number: int) -> str:
"""Fetch PR diff via Forgejo API."""
import aiohttp
url = f"{config.FORGEJO_URL}/api/v1/repos/{config.FORGEJO_OWNER}/{config.FORGEJO_REPO}/pulls/{pr_number}.diff"
token = config.FORGEJO_TOKEN_FILE.read_text().strip() if config.FORGEJO_TOKEN_FILE.exists() else ""
headers = {"Authorization": f"token {token}", "Accept": "text/plain"}
try:
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=headers,
timeout=aiohttp.ClientTimeout(total=60)) as resp:
if resp.status >= 400:
return ""
diff = await resp.text()
if len(diff) > 2_000_000:
return "" # Too large
return diff
except Exception as e:
logger.error("Failed to fetch diff for PR #%d: %s", pr_number, e)
return ""
async def _get_pr_head_sha(pr_number: int) -> str:
"""Get HEAD SHA of PR's branch."""
pr_info = await _forgejo_api(
"GET",
f"/repos/{config.FORGEJO_OWNER}/{config.FORGEJO_REPO}/pulls/{pr_number}",
)
if pr_info:
return pr_info.get("head", {}).get("sha", "")
return ""
async def _has_tier0_comment(pr_number: int, head_sha: str) -> bool:
"""Check if we already validated this exact commit."""
if not head_sha:
return False
# Paginate comments (Ganymede standing rule)
page = 1
while True:
comments = await _forgejo_api(
"GET",
f"/repos/{config.FORGEJO_OWNER}/{config.FORGEJO_REPO}/issues/{pr_number}/comments"
f"?limit=50&page={page}",
)
if not comments:
break
marker = f"<!-- TIER0-VALIDATION:{head_sha} -->"
for c in comments:
if marker in c.get("body", ""):
return True
if len(comments) < 50:
break
page += 1
return False
async def _post_validation_comment(pr_number: int, results: list[dict], head_sha: str):
"""Post Tier 0 validation results as PR comment."""
all_pass = all(r["passes"] for r in results)
total = len(results)
passing = sum(1 for r in results if r["passes"])
marker = f"<!-- TIER0-VALIDATION:{head_sha} -->" if head_sha else "<!-- TIER0-VALIDATION -->"
status = "PASS" if all_pass else "FAIL"
lines = [
marker,
f"**Tier 0 Validation: {status}** — {passing}/{total} claims pass\n",
]
for r in results:
icon = "pass" if r["passes"] else "FAIL"
short_path = r["filepath"].split("/", 1)[-1] if "/" in r["filepath"] else r["filepath"]
lines.append(f"**[{icon}]** `{short_path}`")
for v in r["violations"]:
lines.append(f" - {v}")
for w in r["warnings"]:
lines.append(f" - (warn) {w}")
lines.append("")
if not all_pass:
lines.append("---")
lines.append("Fix the violations above and push to trigger re-validation.")
lines.append(f"\n*tier0-gate v2 | {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*")
await _forgejo_api(
"POST",
f"/repos/{config.FORGEJO_OWNER}/{config.FORGEJO_REPO}/issues/{pr_number}/comments",
{"body": "\n".join(lines)},
)
# ─── Existing claims index ─────────────────────────────────────────────────
def load_existing_claims() -> set[str]:
"""Build set of known claim titles from the main worktree."""
claims: set[str] = set()
base = config.MAIN_WORKTREE
for subdir in ["domains", "core", "foundations", "maps", "agents", "schemas"]:
full = base / subdir
if not full.is_dir():
continue
for f in full.rglob("*.md"):
claims.add(f.stem)
return claims
# ─── Main entry point ──────────────────────────────────────────────────────
async def validate_pr(conn, pr_number: int) -> dict:
"""Run Tier 0 validation on a single PR.
Returns {pr, all_pass, total, passing, skipped, reason}.
"""
# Get HEAD SHA for idempotency
head_sha = await _get_pr_head_sha(pr_number)
# Skip if already validated for this commit
if await _has_tier0_comment(pr_number, head_sha):
logger.debug("PR #%d already validated at %s", pr_number, head_sha[:8])
return {"pr": pr_number, "skipped": True, "reason": "already_validated"}
# Fetch diff
diff = await _get_pr_diff(pr_number)
if not diff:
logger.debug("PR #%d: empty or oversized diff", pr_number)
return {"pr": pr_number, "skipped": True, "reason": "no_diff"}
# Extract claim files
claim_files = extract_claim_files_from_diff(diff)
if not claim_files:
logger.debug("PR #%d: no claim files in diff", pr_number)
return {"pr": pr_number, "skipped": True, "reason": "no_claims"}
# Load existing claims index
existing_claims = load_existing_claims()
# Validate each claim
results = []
for filepath, content in claim_files.items():
result = tier0_validate_claim(filepath, content, existing_claims)
results.append(result)
status = "PASS" if result["passes"] else "FAIL"
logger.debug("PR #%d: %s %s v=%s w=%s", pr_number, status, filepath,
result["violations"], result["warnings"])
all_pass = all(r["passes"] for r in results)
total = len(results)
passing = sum(1 for r in results if r["passes"])
logger.info("PR #%d: Tier 0 — %d/%d pass, all_pass=%s", pr_number, passing, total, all_pass)
# Post comment
await _post_validation_comment(pr_number, results, head_sha)
# Update PR record
conn.execute(
"UPDATE prs SET tier0_pass = ? WHERE number = ?",
(1 if all_pass else 0, pr_number),
)
db.audit(conn, "validate", "tier0_complete",
json.dumps({"pr": pr_number, "pass": all_pass, "passing": passing, "total": total}))
return {"pr": pr_number, "all_pass": all_pass, "total": total, "passing": passing}
async def validate_cycle(conn, max_workers=None) -> tuple[int, int]:
    """Run one validation cycle.
    Selects PRs with status='open' and tier0_pass IS NULL (priority ordered) and
    validates them sequentially; max_workers caps the batch size (default 10).
    """
# Find unvalidated PRs (priority ordered)
rows = conn.execute(
"""SELECT p.number FROM prs p
LEFT JOIN sources s ON p.source_path = s.path
WHERE p.status = 'open'
AND p.tier0_pass IS NULL
ORDER BY
CASE COALESCE(p.priority, s.priority, 'medium')
WHEN 'critical' THEN 0
WHEN 'high' THEN 1
WHEN 'medium' THEN 2
WHEN 'low' THEN 3
ELSE 4
END,
p.created_at ASC
LIMIT ?""",
(max_workers or 10,),
).fetchall()
if not rows:
return 0, 0
succeeded = 0
failed = 0
for row in rows:
try:
result = await validate_pr(conn, row["number"])
if result.get("skipped"):
                # Mark skipped PRs as validated so they are not re-selected:
                # no diff, no claim files, or already commented on this HEAD SHA.
conn.execute(
"UPDATE prs SET tier0_pass = 1 WHERE number = ? AND tier0_pass IS NULL",
(row["number"],),
)
succeeded += 1
elif result.get("all_pass"):
succeeded += 1
else:
succeeded += 1 # Validation ran successfully, even if claims failed
except Exception:
logger.exception("Failed to validate PR #%d", row["number"])
failed += 1
if succeeded or failed:
logger.info("Validate cycle: %d validated, %d errors", succeeded, failed)
return succeeded, failed
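# Usage sketch (illustrative; `db.connect()` is assumed here as a stand-in for
# however the daemon obtains its sqlite3 connection with a Row factory):
#
#   import asyncio
#   from lib import db, validate
#
#   async def main():
#       conn = db.connect()
#       ok, err = await validate.validate_cycle(conn, max_workers=5)
#       print(f"validated={ok} errors={err}")
#
#   asyncio.run(main())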