"""Validate stage — Tier 0 deterministic validation gate.

Ported from tier0-gate.py + validate_claims.py. Pure Python, no LLM calls.
Validates claim frontmatter, title format, wiki links, domain-directory match,
proposition heuristic, universal quantifiers, near-duplicate detection.

Runs against PRs with status 'open' that have tier0_pass IS NULL.
Posts results as PR comments. In gate mode, sets tier0_pass = 0/1.
"""

import json
import logging
import re
from datetime import date, datetime, timezone
from difflib import SequenceMatcher
from pathlib import Path

from . import config, db
from .domains import VALID_DOMAINS
from .forgejo import api as forgejo_api
from .forgejo import get_pr_diff, repo_path

logger = logging.getLogger("pipeline.validate")

# ─── Constants ──────────────────────────────────────────────────────────────

VALID_TYPES = frozenset(config.TYPE_SCHEMAS.keys())
# Default confidence values (union of all types that define them)
VALID_CONFIDENCE = frozenset(
    c
    for schema in config.TYPE_SCHEMAS.values()
    if schema.get("valid_confidence")
    for c in schema["valid_confidence"]
)
DATE_MIN = date(2020, 1, 1)
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
DEDUP_THRESHOLD = 0.85

# A frontmatter terminator is a line consisting only of "---". Searching for the
# bare substring instead would stop at a "---" embedded in a YAML value.
_FRONTMATTER_CLOSE_RE = re.compile(r"^---[ \t]*\r?$", re.MULTILINE)

# Proposition heuristic patterns
_STRONG_SIGNALS = re.compile(
    r"\b(because|therefore|however|although|despite|since|"
    r"rather than|instead of|not just|more than|less than|"
    r"by\b|through\b|via\b|without\b|"
    r"when\b|where\b|while\b|if\b|unless\b|"
    r"which\b|that\b|"
    r"is\b|are\b|was\b|were\b|will\b|would\b|"
    r"can\b|could\b|should\b|must\b|"
    r"has\b|have\b|had\b|does\b|did\b)",
    re.IGNORECASE,
)

_VERB_ENDINGS = re.compile(
    r"\b\w{2,}(ed|ing|es|tes|ses|zes|ves|cts|pts|nts|rns|ps|ts|rs|ns|ds)\b",
    re.IGNORECASE,
)

_UNIVERSAL_QUANTIFIERS = re.compile(
    r"\b(all|every|always|never|no one|nobody|nothing|none of|"
    r"the only|the fundamental|the sole|the single|"
    r"universally|invariably|without exception|in every case)\b",
    re.IGNORECASE,
)

_SCOPING_LANGUAGE = re.compile(
    r"\b(when|if|under|given|assuming|provided|in cases where|"
    r"for .+ that|among|within|across|during|between|"
    r"approximately|roughly|nearly|most|many|often|typically|"
    r"tends? to|generally|usually|frequently)\b",
    re.IGNORECASE,
)


# ─── Small shared helpers ──────────────────────────────────────────────────


def _is_member(value, allowed) -> bool:
    """Return ``value in allowed``, treating un-testable values as non-members.

    Frontmatter comes from PR content, so a field may hold a list or dict.
    Testing such a value against a set/dict raises TypeError (unhashable);
    that must surface as a validation failure, not crash the validator.
    """
    try:
        return value in allowed
    except TypeError:
        return False


def _spaced(title: str) -> str:
    """Return *title* with hyphens replaced by spaces (filename stem → prose)."""
    return title.replace("-", " ")


def _tier0_marker(head_sha: str) -> str:
    """Return the hidden per-commit marker embedded in validation comments.

    NOTE(review): the marker literal was empty in the code this replaces, which
    made every comment look like a prior validation. The exact historical text
    is not recoverable from this file — confirm it against comments already
    posted if idempotency with old comments matters.
    """
    return f"<!-- TIER0-VALIDATION:{head_sha} -->"


# ─── YAML frontmatter parser ───────────────────────────────────────────────


def parse_frontmatter(text: str) -> tuple[dict | None, str]:
    """Extract YAML frontmatter and body from markdown text.

    Returns:
        ``(frontmatter, body)``. ``frontmatter`` is None when *text* has no
        leading ``---``, has no terminator, fails to parse, or does not parse
        to a mapping. When no frontmatter block is found at all, ``body`` is
        the unmodified *text*; otherwise it is the stripped remainder.
    """
    if not text.startswith("---"):
        return None, text

    # Prefer a terminator on its own line; fall back to the first "---"
    # substring so input without a line-anchored closer parses as before.
    closing = _FRONTMATTER_CLOSE_RE.search(text, 3)
    end = closing.start() if closing else text.find("---", 3)
    if end == -1:
        return None, text

    raw = text[3:end]
    body = text[end + 3:].strip()

    try:
        import yaml

        fm = yaml.safe_load(raw)
        if not isinstance(fm, dict):
            return None, body
        return fm, body
    except ImportError:
        pass  # PyYAML unavailable — use the simple parser below
    except Exception:
        # Deliberately broad: any parse failure on PR-supplied YAML means
        # "no usable frontmatter", never a crash.
        return None, body

    # Fallback: simple key-value parser
    fm = {}
    for line in raw.strip().split("\n"):
        line = line.strip()
        if not line or line.startswith("#") or ":" not in line:
            continue
        key, _, val = line.partition(":")
        key = key.strip()
        val = val.strip().strip('"').strip("'")
        if val.lower() == "null" or val == "":
            val = None
        elif val.startswith("["):
            val = [v.strip().strip('"').strip("'") for v in val.strip("[]").split(",") if v.strip()]
        fm[key] = val

    return fm if fm else None, body


# ─── Validators ─────────────────────────────────────────────────────────────


def validate_schema(fm: dict) -> list[str]:
    """Check required fields and valid enums, branching on content type.

    A missing or unknown ``type`` is reported and the "claim" schema is used
    for the remaining checks. Returns a list of violation tags (empty = ok).
    """
    violations = []

    ftype = fm.get("type")
    if not ftype:
        violations.append("missing_field:type")
        schema = config.TYPE_SCHEMAS["claim"]  # strictest default
    elif not _is_member(ftype, config.TYPE_SCHEMAS):
        violations.append(f"invalid_type:{ftype}")
        schema = config.TYPE_SCHEMAS["claim"]
    else:
        schema = config.TYPE_SCHEMAS[ftype]

    for field in schema["required"]:
        if field not in fm or fm[field] is None:
            violations.append(f"missing_field:{field}")

    domain = fm.get("domain")
    if domain and not _is_member(domain, VALID_DOMAINS):
        violations.append(f"invalid_domain:{domain}")

    valid_conf = schema.get("valid_confidence")
    confidence = fm.get("confidence")
    if valid_conf and confidence and not _is_member(confidence, valid_conf):
        violations.append(f"invalid_confidence:{confidence}")

    desc = fm.get("description")
    if isinstance(desc, str) and len(desc.strip()) < 10:
        violations.append("description_too_short")

    source = fm.get("source")
    if "source" in schema["required"] and isinstance(source, str) and len(source.strip()) < 3:
        violations.append("source_too_short")

    return violations


def validate_date(date_val) -> list[str]:
    """Validate created date.

    Accepts a ``date``, a ``datetime`` (reduced to its date part) or a
    ``YYYY-MM-DD`` string. Flags missing values, unparseable strings,
    unsupported types, dates after today and dates before DATE_MIN.
    """
    if date_val is None:
        return ["missing_field:created"]

    # datetime is a subclass of date; comparing it with date.today() raises
    # TypeError, so it must be narrowed first.
    if isinstance(date_val, datetime):
        parsed = date_val.date()
    elif isinstance(date_val, date):
        parsed = date_val
    elif isinstance(date_val, str):
        try:
            parsed = datetime.strptime(date_val, "%Y-%m-%d").date()
        except ValueError:
            return [f"invalid_date_format:{date_val}"]
    else:
        return [f"invalid_date_type:{type(date_val).__name__}"]

    violations = []
    if parsed > date.today():
        violations.append(f"future_date:{parsed}")
    if parsed < DATE_MIN:
        violations.append(f"date_before_2020:{parsed}")
    return violations


def validate_title(filepath: str) -> list[str]:
    """Check filename follows prose-as-claim convention.

    The filename stem must be at least 20 characters and 4 words long (hyphens
    count as word separators) and contain only letters, digits, whitespace and
    ``- . , ' ( ) %``.
    """
    violations = []
    name = Path(filepath).stem
    normalized = _spaced(name)

    if len(normalized) < 20:
        violations.append("title_too_short")

    if len(normalized.split()) < 4:
        violations.append("title_too_few_words")

    # Whatever survives removal of the allowed characters is disallowed.
    cleaned = re.sub(r"[a-zA-Z0-9\s\-\.,'()%]", "", name)
    if cleaned:
        violations.append(f"title_special_chars:{cleaned[:20]}")

    return violations


def validate_wiki_links(body: str, existing_claims: set[str]) -> list[str]:
    """Check that [[wiki links]] resolve to known claims.

    A link resolves when its stripped text is exactly an entry of
    *existing_claims*. Returns one ``broken_wiki_link:`` tag per miss.
    """
    targets = (link.strip() for link in WIKI_LINK_RE.findall(body))
    return [
        f"broken_wiki_link:{target[:80]}"
        for target in targets
        if target and target not in existing_claims
    ]


def validate_proposition(title: str) -> list[str]:
    """Check title reads as a proposition, not a label.

    Passes when the title has at least 4 words and contains a connective or
    auxiliary, a verb-like word ending, or at least 8 words.
    """
    normalized = _spaced(title)
    n = len(normalized.split())

    if n < 4:
        return ["title_not_proposition:too short to be a disagreeable sentence"]

    if _STRONG_SIGNALS.search(normalized) or _VERB_ENDINGS.search(normalized) or n >= 8:
        return []

    return ["title_not_proposition:no verb or connective found"]


def validate_universal_quantifiers(title: str) -> list[str]:
    """Flag unscoped universal quantifiers (warning, not gate)."""
    # Callers pass the filename stem; without normalizing hyphens the
    # multi-word patterns ("the only", "tends to", ...) can never match.
    normalized = _spaced(title)
    universals = _UNIVERSAL_QUANTIFIERS.findall(normalized)
    if universals and not _SCOPING_LANGUAGE.search(normalized):
        return [f"unscoped_universal:{','.join(universals)}"]
    return []


def validate_domain_directory_match(filepath: str, fm: dict) -> list[str]:
    """Check file's directory matches its domain field.

    Only the first ``domains/<dir>/`` component of *filepath* is considered.
    A mismatch is tolerated when ``<dir>`` is listed in ``secondary_domains``.
    """
    domain = fm.get("domain")
    if not domain:
        return []

    parts = Path(filepath).parts
    for i, part in enumerate(parts):
        if part == "domains" and i + 1 < len(parts):
            dir_domain = parts[i + 1]
            if dir_domain != domain:
                secondary = fm.get("secondary_domains", [])
                if isinstance(secondary, str):
                    secondary = [secondary]
                # _is_member also absorbs non-iterable junk (e.g. a number).
                if not _is_member(dir_domain, secondary or []):
                    return [
                        f"domain_directory_mismatch:file in domains/{dir_domain}/ but domain field says '{domain}'"
                    ]
            break

    return []


def validate_description_not_title(title: str, description: str) -> list[str]:
    """Check description adds info beyond the title."""
    if not description:
        return []

    # Titles arrive as filename stems; compare in prose form so a hyphenated
    # stem can actually match a spaced description.
    title_lower = _spaced(title).lower().strip()
    desc_lower = description.lower().strip().rstrip(".")
    if not desc_lower:
        # Nothing left to compare ("" is a substring of everything).
        return []

    if desc_lower in title_lower or title_lower in desc_lower:
        return ["description_echoes_title"]

    ratio = SequenceMatcher(None, title_lower, desc_lower).ratio()
    if ratio > 0.75:
        return [f"description_too_similar:{ratio:.0%}"]

    return []


def find_near_duplicates(title: str, existing_claims: set[str]) -> list[str]:
    """Find near-duplicate titles using SequenceMatcher with word pre-filter.

    A candidate is compared only if it shares at least 2 of its first 6 words
    with the first 6 words of *title*; it is reported when the similarity
    ratio reaches DEDUP_THRESHOLD.
    """
    # Hyphens are word separators in filename stems. Splitting a hyphenated
    # stem on whitespace alone yields a single token, which made the
    # "≥ 2 shared words" pre-filter reject every candidate.
    title_lower = _spaced(title).lower()
    title_words = set(title_lower.split()[:6])

    warnings = []
    for existing in existing_claims:
        existing_lower = _spaced(existing).lower()
        if len(title_words & set(existing_lower.split()[:6])) < 2:
            continue
        ratio = SequenceMatcher(None, title_lower, existing_lower).ratio()
        if ratio >= DEDUP_THRESHOLD:
            warnings.append(f"near_duplicate:{existing[:80]} (similarity={ratio:.2f})")
    return warnings


# ─── Full Tier 0 validation ────────────────────────────────────────────────


def tier0_validate_claim(filepath: str, content: str, existing_claims: set[str]) -> dict:
    """Run full Tier 0 validation. Returns {filepath, passes, violations, warnings}.

    Branches on content type (claim/framework/entity) via TYPE_SCHEMAS.
    Entities skip proposition title check, date validation, and confidence —
    they're factual records, not arguable claims.
    """
    violations = []
    warnings = []

    fm, body = parse_frontmatter(content)
    if fm is None:
        return {"filepath": filepath, "passes": False, "violations": ["no_frontmatter"], "warnings": []}

    violations.extend(validate_schema(fm))

    # Type-aware checks. Unknown (or unhashable) types fall back to "claim".
    ftype = fm.get("type", "claim")
    if _is_member(ftype, config.TYPE_SCHEMAS):
        schema = config.TYPE_SCHEMAS[ftype]
    else:
        schema = config.TYPE_SCHEMAS["claim"]

    if "created" in schema["required"]:
        violations.extend(validate_date(fm.get("created")))

    title = Path(filepath).stem
    if schema.get("needs_proposition_title", True):
        # Title length/format checks only for claims/frameworks — entity filenames
        # like "metadao.md" are intentionally short (Ganymede review)
        violations.extend(validate_title(filepath))
        violations.extend(validate_proposition(title))
        warnings.extend(validate_universal_quantifiers(title))

    # Wiki links are warnings, not violations — broken links usually point to
    # claims in other open PRs that haven't merged yet. (Cory, Mar 14)
    warnings.extend(validate_wiki_links(body, existing_claims))

    violations.extend(validate_domain_directory_match(filepath, fm))

    desc = fm.get("description", "")
    if isinstance(desc, str):
        warnings.extend(validate_description_not_title(title, desc))

    # Skip near_duplicate for entities — entity updates matching existing entities
    # is correct behavior, not duplication. 83% false positive rate on entities. (Leo/Rhea)
    if ftype != "entity" and not filepath.startswith("entities/"):
        warnings.extend(find_near_duplicates(title, existing_claims))

    return {"filepath": filepath, "passes": len(violations) == 0, "violations": violations, "warnings": warnings}


# ─── Diff parsing ──────────────────────────────────────────────────────────


def extract_claim_files_from_diff(diff: str) -> dict[str, str]:
    """Parse unified diff to extract new/modified claim file contents.

    Only ``.md`` files under domains/, core/ or foundations/ whose basename
    does not start with ``_`` are returned. The value is the file's added
    ("+") lines joined with newlines — complete content for new files,
    partial content for modified ones. Deleted files are skipped.
    """
    claim_dirs = ("domains/", "core/", "foundations/")
    files = {}
    current_file = None
    current_lines = []
    is_deletion = False

    for line in diff.split("\n"):
        if line.startswith("diff --git"):
            # New file section: flush the previous one.
            if current_file and not is_deletion:
                files[current_file] = "\n".join(current_lines)
            current_file = None
            current_lines = []
            is_deletion = False
        elif line.startswith(("deleted file mode", "+++ /dev/null")):
            is_deletion = True
            current_file = None
        elif line.startswith("+++ b/") and not is_deletion:
            # git appends a TAB to the header path when it contains a space.
            path = line[6:].rstrip("\t")
            basename = path.rsplit("/", 1)[-1]
            if path.startswith(claim_dirs) and path.endswith(".md") and not basename.startswith("_"):
                current_file = path
        elif current_file and line.startswith("+") and not line.startswith("+++"):
            current_lines.append(line[1:])

    if current_file and not is_deletion:
        files[current_file] = "\n".join(current_lines)

    return files


async def _get_pr_head_sha(pr_number: int) -> str:
    """Get HEAD SHA of PR's branch ("" when it cannot be determined)."""
    pr_info = await forgejo_api(
        "GET",
        repo_path(f"pulls/{pr_number}"),
    )
    if pr_info:
        # "head" may be present but null.
        return (pr_info.get("head") or {}).get("sha") or ""
    return ""


async def _has_tier0_comment(pr_number: int, head_sha: str) -> bool:
    """Check if we already validated this exact commit.

    Scans the PR's comments page by page for the marker that
    _post_validation_comment embeds for *head_sha*.
    """
    if not head_sha:
        return False

    marker = _tier0_marker(head_sha)

    # Paginate comments (Ganymede standing rule)
    page = 1
    while True:
        comments = await forgejo_api(
            "GET",
            repo_path(f"issues/{pr_number}/comments?limit=50&page={page}"),
        )
        if not comments:
            break
        if any(marker in (c.get("body") or "") for c in comments):
            return True
        if len(comments) < 50:
            break  # short page = last page
        page += 1

    return False


async def _post_validation_comment(
    pr_number: int,
    results: list[dict],
    head_sha: str,
    t05_issues: list[str] | None = None,
    t05_details: list[str] | None = None,
):
    """Post Tier 0 + Tier 0.5 validation results as PR comment.

    Args:
        pr_number: PR to comment on.
        results: per-file dicts as returned by tier0_validate_claim.
        head_sha: commit being validated; embedded as a hidden marker so
            _has_tier0_comment can detect it later. Empty = no marker.
        t05_issues: Tier 0.5 issue tags; any non-empty list renders as FAIL.
        t05_details: human-readable Tier 0.5 detail lines.
    """
    tier0_pass = all(r["passes"] for r in results)
    t05_pass = not t05_issues  # empty list = pass
    all_pass = tier0_pass and t05_pass
    total = len(results)
    passing = sum(1 for r in results if r["passes"])

    marker = _tier0_marker(head_sha) if head_sha else ""
    status = "PASS" if all_pass else "FAIL"
    lines = [
        marker,
        f"**Validation: {status}** — {passing}/{total} claims pass\n",
    ]

    for r in results:
        icon = "pass" if r["passes"] else "FAIL"
        short_path = r["filepath"].split("/", 1)[-1]  # drop the top-level directory
        lines.append(f"**[{icon}]** `{short_path}`")
        lines.extend(f" - {v}" for v in r["violations"])
        lines.extend(f" - (warn) {w}" for w in r["warnings"])
        lines.append("")

    # Tier 0.5 results (diff-level checks)
    if t05_issues:
        lines.append("**Tier 0.5 — mechanical pre-check: FAIL**\n")
        lines.extend(f" - {detail}" for detail in (t05_details or []))
        lines.append("")

    if not all_pass:
        lines.append("---")
        lines.append("Fix the violations above and push to trigger re-validation.")
        lines.append("LLM review will run after all mechanical checks pass.")

    lines.append(f"\n*tier0-gate v2 | {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*")

    await forgejo_api(
        "POST",
        repo_path(f"issues/{pr_number}/comments"),
        {"body": "\n".join(lines)},
    )


# ─── Existing claims index
# ─────────────────────────────────────────────────


def load_existing_claims() -> set[str]:
    """Build set of known claim titles from the main worktree.

    A "title" is the filename stem of every ``*.md`` file found recursively
    under the listed top-level directories of ``config.MAIN_WORKTREE``.
    Directories that do not exist are skipped.
    """
    claims: set[str] = set()
    base = config.MAIN_WORKTREE
    for subdir in ("domains", "core", "foundations", "maps", "agents", "schemas", "entities", "decisions"):
        full = base / subdir
        if not full.is_dir():
            continue
        claims.update(f.stem for f in full.rglob("*.md"))
    return claims


# ─── Main entry point ──────────────────────────────────────────────────────


def _extract_all_md_added_content(diff: str) -> dict[str, str]:
    """Extract added content from ALL .md files in diff (not just claim dirs).

    Used for wiki link validation on agent files, musings, etc. that
    extract_claim_files_from_diff skips. Returns {filepath: added_lines}.
    """
    files: dict[str, str] = {}
    current_file = None
    current_lines: list[str] = []
    is_deletion = False

    for line in diff.split("\n"):
        if line.startswith("diff --git"):
            # New file section: flush the previous one.
            if current_file and not is_deletion:
                files[current_file] = "\n".join(current_lines)
            current_file = None
            current_lines = []
            is_deletion = False
        elif line.startswith(("deleted file mode", "+++ /dev/null")):
            is_deletion = True
            current_file = None
        elif line.startswith("+++ b/") and not is_deletion:
            # git appends a TAB to the header path when it contains a space;
            # without stripping it the ".md" suffix test fails.
            path = line[6:].rstrip("\t")
            if path.endswith(".md"):
                current_file = path
        elif current_file and line.startswith("+") and not line.startswith("+++"):
            current_lines.append(line[1:])

    if current_file and not is_deletion:
        files[current_file] = "\n".join(current_lines)

    return files


def _new_files_in_diff(diff: str) -> set[str]:
    """Extract paths of newly added files from a unified diff.

    A file is new when its ``--- /dev/null`` header is immediately followed
    by a ``+++ b/<path>`` header.
    """
    new_files: set[str] = set()
    lines = diff.split("\n")
    for i, line in enumerate(lines):
        if line.startswith("--- /dev/null") and i + 1 < len(lines) and lines[i + 1].startswith("+++ b/"):
            # Strip git's trailing TAB (emitted for paths containing spaces)
            # so these paths compare equal to the extractors' keys.
            new_files.add(lines[i + 1][6:].rstrip("\t"))
    return new_files


def tier05_mechanical_check(diff: str, existing_claims: set[str] | None = None) -> tuple[bool, list[str], list[str]]:
    """Tier 0.5: mechanical pre-check for frontmatter schema + wiki links.

    Runs deterministic Python checks ($0) to catch issues that LLM reviewers
    rubber-stamp or reject without structured issue tags. Moved from evaluate.py
    to validate.py so that mechanical issues are caught BEFORE eval, not during.

    Only checks NEW files for frontmatter (modified files have partial content
    from diff — Bug 2). Wiki links checked on ALL .md files.

    Returns (passes, issue_tags, detail_messages). ``passes`` is False only for
    frontmatter/schema failures; near-duplicate and wiki-link findings are
    tagged but do not gate.
    """
    claim_files = extract_claim_files_from_diff(diff)
    all_md_files = _extract_all_md_added_content(diff)

    if not claim_files and not all_md_files:
        return True, [], []

    if existing_claims is None:
        existing_claims = load_existing_claims()

    new_files = _new_files_in_diff(diff)

    issues: list[str] = []
    details: list[str] = []
    gate_failed = False

    # Pass 1: Claim-specific checks (frontmatter, schema, near-duplicate)
    for filepath, content in claim_files.items():
        if filepath not in new_files:
            continue  # modified file — partial content, nothing to check here

        fm, _body = parse_frontmatter(content)
        if fm is None:
            issues.append("frontmatter_schema")
            details.append(f"{filepath}: no valid YAML frontmatter")
            gate_failed = True
            continue

        schema_errors = validate_schema(fm)
        if schema_errors:
            issues.append("frontmatter_schema")
            details.append(f"{filepath}: {', '.join(schema_errors)}")
            gate_failed = True

        # Near-duplicate (warning only — tagged but doesn't gate)
        # Skip for entities — entity updates matching existing entities is expected.
        title = Path(filepath).stem
        ftype_check = fm.get("type", "claim")
        if ftype_check != "entity" and not filepath.startswith("entities/"):
            dup_warnings = find_near_duplicates(title, existing_claims)
            if dup_warnings:
                issues.append("near_duplicate")
                details.append(f"{filepath}: {', '.join(w[:60] for w in dup_warnings[:2])}")

    # Pass 2: Wiki link check on ALL .md files
    # Broken wiki links are a WARNING, not a gate. Most broken links point to claims
    # in other open PRs that haven't merged yet — they resolve naturally as the
    # dependency chain merges. LLM reviewers catch genuinely missing references.
    # (Cory directive, Mar 14: "they'll likely merge")
    for filepath, content in all_md_files.items():
        link_errors = validate_wiki_links(content, existing_claims)
        if link_errors:
            issues.append("broken_wiki_links")
            details.append(f"{filepath}: (warn) {', '.join(e[:60] for e in link_errors[:3])}")
            # NOT gate_failed — wiki links are warnings, not blockers

    unique_issues = list(dict.fromkeys(issues))  # de-duplicate, keep first-seen order
    return not gate_failed, unique_issues, details


async def validate_pr(conn, pr_number: int) -> dict:
    """Run Tier 0 + Tier 0.5 validation on a single PR.

    Tier 0: per-claim validation (schema, date, title, wiki links, proposition).
    Tier 0.5: diff-level mechanical checks (frontmatter schema on new files, wiki links on all .md).

    Both must pass for tier0_pass = 1. If either fails, eval won't touch this PR.
    Fixer handles wiki links; non-fixable issues exhaust fix_attempts → terminal.

    Returns {pr, all_pass, total, passing, tier05_issues} after a validation
    run, or {pr, skipped, reason} when the PR was already validated at its
    current commit or no diff was available.
    """
    # Get HEAD SHA for idempotency
    head_sha = await _get_pr_head_sha(pr_number)

    # Skip if already validated for this commit
    if await _has_tier0_comment(pr_number, head_sha):
        logger.debug("PR #%d already validated at %s", pr_number, head_sha[:8])
        return {"pr": pr_number, "skipped": True, "reason": "already_validated"}

    # Fetch diff
    diff = await get_pr_diff(pr_number)
    if not diff:
        logger.debug("PR #%d: empty or oversized diff", pr_number)
        return {"pr": pr_number, "skipped": True, "reason": "no_diff"}

    # Load existing claims index (shared between Tier 0 and Tier 0.5)
    existing_claims = load_existing_claims()

    # Extract claim files (domains/, core/, foundations/)
    claim_files = extract_claim_files_from_diff(diff)

    # ── Tier 0: per-claim validation ──
    # Only validates NEW files (not modified). Modified files have partial content
    # from diffs (only + lines) — frontmatter parsing fails on partial content,
    # producing false no_frontmatter violations. Enrichment PRs that modify
    # existing claim files were getting stuck here. (Epimetheus session 2)
    new_files = _new_files_in_diff(diff)
    results = []
    for filepath, content in claim_files.items():
        if filepath not in new_files:
            continue  # Skip modified files — partial diff content can't be validated
        result = tier0_validate_claim(filepath, content, existing_claims)
        results.append(result)
        status = "PASS" if result["passes"] else "FAIL"
        logger.debug("PR #%d: %s %s v=%s w=%s", pr_number, status, filepath, result["violations"], result["warnings"])

    # all() of an empty sequence is True, so "no new claim files" passes Tier 0.
    tier0_pass = all(r["passes"] for r in results)
    total = len(results)
    passing = sum(1 for r in results if r["passes"])

    # ── Tier 0.5: diff-level mechanical checks ──
    # Always runs — catches broken wiki links in ALL .md files including entities.
    t05_pass, t05_issues, t05_details = tier05_mechanical_check(diff, existing_claims)

    if not claim_files:
        if t05_pass:
            # Entity/source-only PR with no wiki link issues — pass through
            logger.debug("PR #%d: no claim files, Tier 0.5 passed — auto-pass", pr_number)
        else:
            logger.info("PR #%d: no claim files but Tier 0.5 failed: %s", pr_number, t05_issues)

    # Combined result: both tiers must pass
    all_pass = tier0_pass and t05_pass

    logger.info(
        "PR #%d: Tier 0 — %d/%d pass | Tier 0.5 — %s (issues: %s) | combined: %s",
        pr_number,
        passing,
        total,
        "PASS" if t05_pass else "FAIL",
        t05_issues,
        all_pass,
    )

    # Post combined comment
    await _post_validation_comment(pr_number, results, head_sha, t05_issues, t05_details)

    # Update PR record — reset eval state on new commits
    # WARNING-ONLY issue tags (broken_wiki_links, near_duplicate) should NOT
    # prevent tier0_pass. Only blocking tags (frontmatter_schema, etc.) gate.
    # This was causing an infinite fixer→validate loop where wiki link warnings
    # kept resetting tier0_pass=0. (Epimetheus, session 2 fix)
    # Determine effective pass: per-claim violations always gate. Tier 0.5 warnings don't.
    # (Ganymede: verify this doesn't accidentally pass real schema failures)
    WARNING_ONLY_TAGS = {"broken_wiki_links", "near_duplicate"}
    blocking_t05_issues = set(t05_issues) - WARNING_ONLY_TAGS if t05_issues else set()
    # Pass if: per-claim checks pass AND no blocking Tier 0.5 issues
    effective_pass = tier0_pass and not blocking_t05_issues

    conn.execute(
        """UPDATE prs SET tier0_pass = ?,
           eval_attempts = 0, eval_issues = ?,
           domain_verdict = 'pending', leo_verdict = 'pending',
           last_error = NULL
           WHERE number = ?""",
        (1 if effective_pass else 0, json.dumps(t05_issues) if t05_issues else "[]", pr_number),
    )
    db.audit(
        conn,
        "validate",
        "tier0_complete",
        json.dumps({
            "pr": pr_number,
            "pass": all_pass,
            "tier0_pass": tier0_pass,
            "tier05_pass": t05_pass,
            "passing": passing,
            "total": total,
            "tier05_issues": t05_issues,
        }),
    )

    return {
        "pr": pr_number,
        "all_pass": all_pass,
        "total": total,
        "passing": passing,
        "tier05_issues": t05_issues,
    }


async def validate_cycle(conn, max_workers=None) -> tuple[int, int]:
    """Run one validation cycle.

    Finds PRs with status='open' and tier0_pass IS NULL, validates them.

    Args:
        conn: database connection (rows must support access by column name).
        max_workers: maximum number of PRs to process this cycle (default 10).
            PRs are processed one after another.

    Returns:
        ``(succeeded, failed)`` — PRs whose validation ran to completion
        (whatever its verdict) and PRs whose validation raised.
    """
    # Find unvalidated PRs (priority ordered)
    rows = conn.execute(
        """SELECT p.number FROM prs p
           LEFT JOIN sources s ON p.source_path = s.path
           WHERE p.status = 'open'
             AND p.tier0_pass IS NULL
           ORDER BY
             CASE COALESCE(p.priority, s.priority, 'medium')
               WHEN 'critical' THEN 0
               WHEN 'high' THEN 1
               WHEN 'medium' THEN 2
               WHEN 'low' THEN 3
               ELSE 4
             END,
             p.created_at ASC
           LIMIT ?""",
        (max_workers or 10,),
    ).fetchall()

    if not rows:
        return 0, 0

    succeeded = 0
    failed = 0

    for row in rows:
        try:
            result = await validate_pr(conn, row["number"])
            if result.get("skipped"):
                # Mark as validated even if skipped (no claims = pass)
                # NOTE(review): this also marks "no_diff" skips (which
                # validate_pr logs as "empty or oversized diff") and
                # "already_validated" skips as passing — confirm intended.
                conn.execute(
                    "UPDATE prs SET tier0_pass = 1 WHERE number = ? AND tier0_pass IS NULL",
                    (row["number"],),
                )
            # Validation ran successfully, even if claims failed
            succeeded += 1
        except Exception:
            # Top-level boundary: one bad PR must not abort the cycle.
            logger.exception("Failed to validate PR #%d", row["number"])
            failed += 1

    if succeeded or failed:
        logger.info("Validate cycle: %d validated, %d errors", succeeded, failed)

    return succeeded, failed