Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source of truth. Previously only 8 of 67 files existed in repo — the rest were deployed directly to VPS via SCP, causing massive drift. Includes: - pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.) - pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh - diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics) - agent-state/: bootstrap, lib-state, cascade inbox processor, schema - systemd/: service unit files for reference - deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate - research-session.sh: updated with Step 8.5 digest + cascade inbox processing No new code written — all files are exact copies from VPS as of 2026-04-06. From this point forward: edit in repo, commit, then deploy.sh. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
753 lines
28 KiB
Python
753 lines
28 KiB
Python
"""Validate stage — Tier 0 deterministic validation gate.
|
|
|
|
Ported from tier0-gate.py + validate_claims.py. Pure Python, no LLM calls.
|
|
Validates claim frontmatter, title format, wiki links, domain-directory match,
|
|
proposition heuristic, universal quantifiers, near-duplicate detection.
|
|
|
|
Runs against PRs with status 'open' that have tier0_pass IS NULL.
|
|
Posts results as PR comments. In gate mode, sets tier0_pass = 0/1.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from datetime import date, datetime, timezone
|
|
from difflib import SequenceMatcher
|
|
from pathlib import Path
|
|
|
|
from . import config, db
|
|
from .domains import VALID_DOMAINS
|
|
from .forgejo import api as forgejo_api
|
|
from .forgejo import get_pr_diff, repo_path
|
|
|
|
logger = logging.getLogger("pipeline.validate")
|
|
|
|
# ─── Constants ──────────────────────────────────────────────────────────────
|
|
|
|
# All content types that have a schema (claim / framework / entity, ...).
VALID_TYPES = frozenset(config.TYPE_SCHEMAS.keys())
# Default confidence values (union of all types that define them)
VALID_CONFIDENCE = frozenset(
    c for schema in config.TYPE_SCHEMAS.values()
    if schema.get("valid_confidence") for c in schema["valid_confidence"]
)
# Floor for `created` dates — validate_date() flags anything earlier.
DATE_MIN = date(2020, 1, 1)
# Matches [[wiki link]]; group 1 is the link target text.
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
# SequenceMatcher ratio at or above which two titles count as near-duplicates.
DEDUP_THRESHOLD = 0.85

# Proposition heuristic patterns
# Connectives / auxiliary verbs whose presence suggests a title is a full
# proposition rather than a bare label (used by validate_proposition).
_STRONG_SIGNALS = re.compile(
    r"\b(because|therefore|however|although|despite|since|"
    r"rather than|instead of|not just|more than|less than|"
    r"by\b|through\b|via\b|without\b|"
    r"when\b|where\b|while\b|if\b|unless\b|"
    r"which\b|that\b|"
    r"is\b|are\b|was\b|were\b|will\b|would\b|"
    r"can\b|could\b|should\b|must\b|"
    r"has\b|have\b|had\b|does\b|did\b)",
    re.IGNORECASE,
)

# Crude suffix-based verb detector (inflected forms: -ed, -ing, third-person
# -s clusters). Heuristic only — false positives are acceptable because it
# merely short-circuits the proposition check in validate_proposition.
_VERB_ENDINGS = re.compile(
    r"\b\w{2,}(ed|ing|es|tes|ses|zes|ves|cts|pts|nts|rns|ps|ts|rs|ns|ds)\b",
    re.IGNORECASE,
)

# Universal quantifiers that should be scoped; warning-level only
# (see validate_universal_quantifiers).
_UNIVERSAL_QUANTIFIERS = re.compile(
    r"\b(all|every|always|never|no one|nobody|nothing|none of|"
    r"the only|the fundamental|the sole|the single|"
    r"universally|invariably|without exception|in every case)\b",
    re.IGNORECASE,
)

# Scoping/hedging language that excuses a universal quantifier.
_SCOPING_LANGUAGE = re.compile(
    r"\b(when|if|under|given|assuming|provided|in cases where|"
    r"for .+ that|among|within|across|during|between|"
    r"approximately|roughly|nearly|most|many|often|typically|"
    r"tends? to|generally|usually|frequently)\b",
    re.IGNORECASE,
)
|
|
|
|
|
|
# ─── YAML frontmatter parser ───────────────────────────────────────────────
|
|
|
|
|
|
def parse_frontmatter(text: str) -> tuple[dict | None, str]:
|
|
"""Extract YAML frontmatter and body from markdown text."""
|
|
if not text.startswith("---"):
|
|
return None, text
|
|
end = text.find("---", 3)
|
|
if end == -1:
|
|
return None, text
|
|
raw = text[3:end]
|
|
body = text[end + 3 :].strip()
|
|
|
|
try:
|
|
import yaml
|
|
|
|
fm = yaml.safe_load(raw)
|
|
if not isinstance(fm, dict):
|
|
return None, body
|
|
return fm, body
|
|
except ImportError:
|
|
pass
|
|
except Exception:
|
|
return None, body
|
|
|
|
# Fallback: simple key-value parser
|
|
fm = {}
|
|
for line in raw.strip().split("\n"):
|
|
line = line.strip()
|
|
if not line or line.startswith("#"):
|
|
continue
|
|
if ":" not in line:
|
|
continue
|
|
key, _, val = line.partition(":")
|
|
key = key.strip()
|
|
val = val.strip().strip('"').strip("'")
|
|
if val.lower() == "null" or val == "":
|
|
val = None
|
|
elif val.startswith("["):
|
|
val = [v.strip().strip('"').strip("'") for v in val.strip("[]").split(",") if v.strip()]
|
|
fm[key] = val
|
|
return fm if fm else None, body
|
|
|
|
|
|
# ─── Validators ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
def validate_schema(fm: dict) -> list[str]:
    """Check required fields and valid enums, branching on content type.

    Returns a list of violation tags (empty = schema-clean). An unknown or
    missing `type` is itself a violation, but validation continues against
    the strictest schema ("claim") so later checks still run.
    """
    out: list[str] = []

    ftype = fm.get("type")
    if not ftype:
        out.append("missing_field:type")
        schema = config.TYPE_SCHEMAS["claim"]  # strictest default
    elif ftype not in config.TYPE_SCHEMAS:
        out.append(f"invalid_type:{ftype}")
        schema = config.TYPE_SCHEMAS["claim"]
    else:
        schema = config.TYPE_SCHEMAS[ftype]

    # Required fields: absent or explicitly null both count as missing.
    out.extend(
        f"missing_field:{field}"
        for field in schema["required"]
        if fm.get(field) is None
    )

    domain = fm.get("domain")
    if domain and domain not in VALID_DOMAINS:
        out.append(f"invalid_domain:{domain}")

    allowed_conf = schema.get("valid_confidence")
    conf = fm.get("confidence")
    if allowed_conf and conf and conf not in allowed_conf:
        out.append(f"invalid_confidence:{conf}")

    desc = fm.get("description")
    if isinstance(desc, str) and len(desc.strip()) < 10:
        out.append("description_too_short")

    src = fm.get("source")
    if "source" in schema["required"] and isinstance(src, str) and len(src.strip()) < 3:
        out.append("source_too_short")

    return out
|
|
|
|
|
|
def validate_date(date_val, *, min_date: date = date(2020, 1, 1)) -> list[str]:
    """Validate a `created` frontmatter date.

    Args:
        date_val: value from frontmatter — a date, a datetime (YAML parses
            bare timestamps into datetime objects), or a "YYYY-MM-DD" string.
        min_date: earliest acceptable date (keyword-only; default mirrors the
            module-level DATE_MIN of 2020-01-01).

    Returns:
        List of violation tags; empty list means valid.
    """
    if date_val is None:
        return ["missing_field:created"]

    # Order matters: datetime is a subclass of date, and comparing a naive
    # datetime against date.today() raises TypeError — normalize it first.
    if isinstance(date_val, datetime):
        parsed = date_val.date()
    elif isinstance(date_val, date):
        parsed = date_val
    elif isinstance(date_val, str):
        try:
            parsed = datetime.strptime(date_val, "%Y-%m-%d").date()
        except ValueError:
            return [f"invalid_date_format:{date_val}"]
    else:
        return [f"invalid_date_type:{type(date_val).__name__}"]

    violations = []
    if parsed > date.today():
        violations.append(f"future_date:{parsed}")
    if parsed < min_date:
        violations.append(f"date_before_2020:{parsed}")
    return violations
|
|
|
|
|
|
def validate_title(filepath: str) -> list[str]:
    """Check filename follows prose-as-claim convention.

    Claim filenames are hyphenated sentences; flags titles that are too
    short, have too few words, or contain disallowed characters.
    """
    problems = []
    stem = Path(filepath).stem
    as_prose = stem.replace("-", " ")

    if len(as_prose) < 20:
        problems.append("title_too_short")
    if len(as_prose.split()) < 4:
        problems.append("title_too_few_words")

    # Strip every allowed character; whatever survives is disallowed.
    leftover = re.sub(r"[a-zA-Z0-9\s\-\.,'()%]", "", stem)
    if leftover:
        problems.append(f"title_special_chars:{leftover[:20]}")

    return problems
|
|
|
|
|
|
def validate_wiki_links(body: str, existing_claims: set[str]) -> list[str]:
    """Check that [[wiki links]] resolve to known claims.

    Returns one `broken_wiki_link:` tag (target truncated to 80 chars) per
    link whose stripped target is non-empty and unknown.
    """
    targets = (raw.strip() for raw in WIKI_LINK_RE.findall(body))
    return [
        f"broken_wiki_link:{t[:80]}"
        for t in targets
        if t and t not in existing_claims
    ]
|
|
|
|
|
|
def validate_proposition(title: str) -> list[str]:
    """Check title reads as a proposition, not a label.

    Heuristic: a title passes if it contains a connective/auxiliary
    (_STRONG_SIGNALS), something verb-shaped (_VERB_ENDINGS), or is long
    enough (>= 8 words) that it is almost certainly a sentence.
    """
    prose = title.replace("-", " ")
    word_count = len(prose.split())

    if word_count < 4:
        return ["title_not_proposition:too short to be a disagreeable sentence"]

    looks_propositional = (
        _STRONG_SIGNALS.search(prose) is not None
        or _VERB_ENDINGS.search(prose) is not None
        or word_count >= 8
    )
    if looks_propositional:
        return []
    return ["title_not_proposition:no verb or connective found"]
|
|
|
|
|
|
def validate_universal_quantifiers(title: str) -> list[str]:
    """Flag unscoped universal quantifiers (warning, not gate).

    A universal ("all", "never", ...) is acceptable when accompanied by
    scoping/hedging language ("when", "most", "typically", ...).
    """
    hits = _UNIVERSAL_QUANTIFIERS.findall(title)
    if not hits or _SCOPING_LANGUAGE.search(title):
        return []
    return [f"unscoped_universal:{','.join(hits)}"]
|
|
|
|
|
|
def validate_domain_directory_match(filepath: str, fm: dict) -> list[str]:
    """Check file's directory matches its domain field.

    Looks at the path component following the first "domains/" segment and
    compares it to the frontmatter `domain`; a mismatch is excused when the
    directory appears in `secondary_domains`.
    """
    declared = fm.get("domain")
    if not declared:
        return []

    parts = Path(filepath).parts
    try:
        idx = parts.index("domains")
    except ValueError:
        return []  # not under a domains/ tree — nothing to compare
    if idx + 1 >= len(parts):
        return []

    dir_domain = parts[idx + 1]
    if dir_domain == declared:
        return []

    secondary = fm.get("secondary_domains", [])
    if isinstance(secondary, str):
        secondary = [secondary]
    if dir_domain in (secondary or []):
        return []
    return [f"domain_directory_mismatch:file in domains/{dir_domain}/ but domain field says '{declared}'"]
|
|
|
|
|
|
def validate_description_not_title(title: str, description: str) -> list[str]:
    """Check description adds info beyond the title.

    Warns when the description is a substring/superstring of the title or
    is more than 75% similar to it by SequenceMatcher ratio.
    """
    if not description:
        return []

    t = title.lower().strip()
    d = description.lower().strip().rstrip(".")

    if d in t or t in d:
        return ["description_echoes_title"]

    similarity = SequenceMatcher(None, t, d).ratio()
    if similarity > 0.75:
        return [f"description_too_similar:{similarity:.0%}"]
    return []
|
|
|
|
|
|
def find_near_duplicates(title: str, existing_claims: set[str]) -> list[str]:
    """Find near-duplicate titles using SequenceMatcher with word pre-filter."""
    probe = title.lower()
    probe_words = set(probe.split()[:6])
    hits = []
    for candidate in existing_claims:
        cand = candidate.lower()
        # Cheap pre-filter: require at least 2 shared words among the first
        # six before paying for a full SequenceMatcher pass.
        if len(probe_words & set(cand.split()[:6])) < 2:
            continue
        score = SequenceMatcher(None, probe, cand).ratio()
        if score >= DEDUP_THRESHOLD:
            hits.append(f"near_duplicate:{candidate[:80]} (similarity={score:.2f})")
    return hits
|
|
|
|
|
|
# ─── Full Tier 0 validation ────────────────────────────────────────────────
|
|
|
|
|
|
def tier0_validate_claim(filepath: str, content: str, existing_claims: set[str]) -> dict:
    """Run full Tier 0 validation. Returns {filepath, passes, violations, warnings}.

    Branches on content type (claim/framework/entity) via TYPE_SCHEMAS.
    Entities skip proposition title check, date validation, and confidence —
    they're factual records, not arguable claims.
    """
    fm, body = parse_frontmatter(content)
    if fm is None:
        return {"filepath": filepath, "passes": False, "violations": ["no_frontmatter"], "warnings": []}

    violations = list(validate_schema(fm))
    warnings: list[str] = []

    # Type-aware checks
    ftype = fm.get("type", "claim")
    schema = config.TYPE_SCHEMAS.get(ftype, config.TYPE_SCHEMAS["claim"])

    if "created" in schema["required"]:
        violations += validate_date(fm.get("created"))

    title = Path(filepath).stem
    if schema.get("needs_proposition_title", True):
        # Title length/format checks only for claims/frameworks — entity filenames
        # like "metadao.md" are intentionally short (Ganymede review)
        violations += validate_title(filepath)
        violations += validate_proposition(title)
        warnings += validate_universal_quantifiers(title)

    # Wiki links are warnings, not violations — broken links usually point to
    # claims in other open PRs that haven't merged yet. (Cory, Mar 14)
    warnings += validate_wiki_links(body, existing_claims)

    violations += validate_domain_directory_match(filepath, fm)

    desc = fm.get("description", "")
    if isinstance(desc, str):
        warnings += validate_description_not_title(title, desc)

    # Skip near_duplicate for entities — entity updates matching existing entities
    # is correct behavior, not duplication. 83% false positive rate on entities. (Leo/Rhea)
    if ftype != "entity" and not filepath.startswith("entities/"):
        warnings += find_near_duplicates(title, existing_claims)

    return {
        "filepath": filepath,
        "passes": len(violations) == 0,
        "violations": violations,
        "warnings": warnings,
    }
|
|
|
|
|
|
# ─── Diff parsing ──────────────────────────────────────────────────────────
|
|
|
|
|
|
def extract_claim_files_from_diff(diff: str) -> dict[str, str]:
    """Parse unified diff to extract new/modified claim file contents.

    Collects the added ("+") lines of every .md file under the claim
    directories, skipping deletions and underscore-prefixed files.
    Returns {filepath: joined added lines}.
    """
    claim_dirs = ("domains/", "core/", "foundations/")
    collected: dict[str, str] = {}
    active: str | None = None
    added: list[str] = []
    deleting = False

    def flush() -> None:
        # Commit the accumulated lines for the file we just finished.
        if active and not deleting:
            collected[active] = "\n".join(added)

    for raw in diff.split("\n"):
        if raw.startswith("diff --git"):
            flush()
            active, added, deleting = None, [], False
        elif raw.startswith(("deleted file mode", "+++ /dev/null")):
            deleting = True
            active = None
        elif raw.startswith("+++ b/") and not deleting:
            path = raw[6:]
            fname = path.rsplit("/", 1)[-1] if "/" in path else path
            if (
                path.endswith(".md")
                and not fname.startswith("_")
                and any(path.startswith(d) for d in claim_dirs)
            ):
                active = path
        elif active and raw.startswith("+") and not raw.startswith("+++"):
            added.append(raw[1:])

    flush()
    return collected
|
|
|
|
|
|
async def _get_pr_head_sha(pr_number: int) -> str:
    """Get HEAD SHA of PR's branch.

    Returns "" when the PR lookup fails or the response lacks a head SHA.
    """
    info = await forgejo_api("GET", repo_path(f"pulls/{pr_number}"))
    if not info:
        return ""
    return info.get("head", {}).get("sha", "")
|
|
|
|
|
|
async def _has_tier0_comment(pr_number: int, head_sha: str) -> bool:
    """Check if we already validated this exact commit.

    Scans all PR comments for the hidden HTML marker embedding head_sha.
    """
    if not head_sha:
        return False
    marker = f"<!-- TIER0-VALIDATION:{head_sha} -->"
    # Paginate comments (Ganymede standing rule)
    page = 1
    while True:
        batch = await forgejo_api(
            "GET",
            repo_path(f"issues/{pr_number}/comments?limit=50&page={page}"),
        )
        if not batch:
            return False
        if any(marker in c.get("body", "") for c in batch):
            return True
        if len(batch) < 50:  # short page = last page
            return False
        page += 1
|
|
|
|
|
|
async def _post_validation_comment(
    pr_number: int, results: list[dict], head_sha: str,
    t05_issues: list[str] | None = None, t05_details: list[str] | None = None,
):
    """Post Tier 0 + Tier 0.5 validation results as PR comment.

    Args:
        pr_number: PR to comment on.
        results: per-claim dicts from tier0_validate_claim
            ({filepath, passes, violations, warnings}).
        head_sha: commit the validation ran against; embedded in a hidden
            HTML marker so _has_tier0_comment can detect re-runs.
        t05_issues: Tier 0.5 issue tags (empty/None = pass).
        t05_details: human-readable detail lines for the Tier 0.5 section.
    """
    tier0_pass = all(r["passes"] for r in results)
    t05_pass = not t05_issues  # empty list = pass
    all_pass = tier0_pass and t05_pass
    total = len(results)
    passing = sum(1 for r in results if r["passes"])

    # Marker without a SHA is still emitted so the comment is identifiable.
    marker = f"<!-- TIER0-VALIDATION:{head_sha} -->" if head_sha else "<!-- TIER0-VALIDATION -->"
    status = "PASS" if all_pass else "FAIL"
    lines = [
        marker,
        f"**Validation: {status}** — {passing}/{total} claims pass\n",
    ]

    # Per-claim section: violations gate, warnings are informational.
    for r in results:
        icon = "pass" if r["passes"] else "FAIL"
        short_path = r["filepath"].split("/", 1)[-1] if "/" in r["filepath"] else r["filepath"]
        lines.append(f"**[{icon}]** `{short_path}`")
        for v in r["violations"]:
            lines.append(f" - {v}")
        for w in r["warnings"]:
            lines.append(f" - (warn) {w}")
        lines.append("")

    # Tier 0.5 results (diff-level checks)
    if t05_issues:
        lines.append("**Tier 0.5 — mechanical pre-check: FAIL**\n")
        for detail in (t05_details or []):
            lines.append(f" - {detail}")
        lines.append("")

    if not all_pass:
        lines.append("---")
        lines.append("Fix the violations above and push to trigger re-validation.")
        lines.append("LLM review will run after all mechanical checks pass.")

    lines.append(f"\n*tier0-gate v2 | {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*")

    await forgejo_api(
        "POST",
        repo_path(f"issues/{pr_number}/comments"),
        {"body": "\n".join(lines)},
    )
|
|
|
|
|
|
# ─── Existing claims index ─────────────────────────────────────────────────
|
|
|
|
|
|
def load_existing_claims() -> set[str]:
    """Build set of known claim titles from the main worktree.

    A "title" is the filename stem of any .md file under the indexed
    subdirectories; used to resolve [[wiki links]] and dedup checks.
    """
    subdirs = ("domains", "core", "foundations", "maps", "agents", "schemas", "entities", "decisions")
    root = config.MAIN_WORKTREE
    claims: set[str] = set()
    for sub in subdirs:
        directory = root / sub
        if directory.is_dir():
            claims.update(md.stem for md in directory.rglob("*.md"))
    return claims
|
|
|
|
|
|
# ─── Main entry point ──────────────────────────────────────────────────────
|
|
|
|
|
|
def _extract_all_md_added_content(diff: str) -> dict[str, str]:
|
|
"""Extract added content from ALL .md files in diff (not just claim dirs).
|
|
|
|
Used for wiki link validation on agent files, musings, etc. that
|
|
extract_claim_files_from_diff skips. Returns {filepath: added_lines}.
|
|
"""
|
|
files: dict[str, str] = {}
|
|
current_file = None
|
|
current_lines: list[str] = []
|
|
is_deletion = False
|
|
|
|
for line in diff.split("\n"):
|
|
if line.startswith("diff --git"):
|
|
if current_file and not is_deletion:
|
|
files[current_file] = "\n".join(current_lines)
|
|
current_file = None
|
|
current_lines = []
|
|
is_deletion = False
|
|
elif line.startswith("deleted file mode") or line.startswith("+++ /dev/null"):
|
|
is_deletion = True
|
|
current_file = None
|
|
elif line.startswith("+++ b/") and not is_deletion:
|
|
path = line[6:]
|
|
if path.endswith(".md"):
|
|
current_file = path
|
|
elif current_file and line.startswith("+") and not line.startswith("+++"):
|
|
current_lines.append(line[1:])
|
|
|
|
if current_file and not is_deletion:
|
|
files[current_file] = "\n".join(current_lines)
|
|
|
|
return files
|
|
|
|
|
|
def _new_files_in_diff(diff: str) -> set[str]:
|
|
"""Extract paths of newly added files from a unified diff."""
|
|
new_files: set[str] = set()
|
|
lines = diff.split("\n")
|
|
for i, line in enumerate(lines):
|
|
if line.startswith("--- /dev/null") and i + 1 < len(lines) and lines[i + 1].startswith("+++ b/"):
|
|
new_files.add(lines[i + 1][6:])
|
|
return new_files
|
|
|
|
|
|
def tier05_mechanical_check(diff: str, existing_claims: set[str] | None = None) -> tuple[bool, list[str], list[str]]:
    """Tier 0.5: mechanical pre-check for frontmatter schema + wiki links.

    Runs deterministic Python checks ($0) to catch issues that LLM reviewers
    rubber-stamp or reject without structured issue tags. Moved from evaluate.py
    to validate.py so that mechanical issues are caught BEFORE eval, not during.

    Only checks NEW files for frontmatter (modified files have partial content
    from diff — Bug 2). Wiki links checked on ALL .md files.

    Returns (passes, issue_tags, detail_messages). Only frontmatter_schema
    issues gate; near_duplicate and broken_wiki_links are warning tags.
    """
    claim_files = extract_claim_files_from_diff(diff)
    all_md_files = _extract_all_md_added_content(diff)

    # Nothing markdown-shaped in the diff — trivially passes.
    if not claim_files and not all_md_files:
        return True, [], []

    if existing_claims is None:
        existing_claims = load_existing_claims()

    new_files = _new_files_in_diff(diff)

    issues: list[str] = []
    details: list[str] = []
    gate_failed = False

    # Pass 1: Claim-specific checks (frontmatter, schema, near-duplicate)
    for filepath, content in claim_files.items():
        is_new = filepath in new_files

        # Frontmatter/schema/dedup only apply to NEW files — a modified file's
        # diff content is partial, so these checks would false-positive.
        if is_new:
            fm, body = parse_frontmatter(content)
            if fm is None:
                issues.append("frontmatter_schema")
                details.append(f"{filepath}: no valid YAML frontmatter")
                gate_failed = True
                continue

            schema_errors = validate_schema(fm)
            if schema_errors:
                issues.append("frontmatter_schema")
                details.append(f"{filepath}: {', '.join(schema_errors)}")
                gate_failed = True

            # Near-duplicate (warning only — tagged but doesn't gate)
            # Skip for entities — entity updates matching existing entities is expected.
            title = Path(filepath).stem
            ftype_check = fm.get("type", "claim")
            if ftype_check != "entity" and not filepath.startswith("entities/"):
                dup_warnings = find_near_duplicates(title, existing_claims)
                if dup_warnings:
                    issues.append("near_duplicate")
                    details.append(f"{filepath}: {', '.join(w[:60] for w in dup_warnings[:2])}")

    # Pass 2: Wiki link check on ALL .md files
    # Broken wiki links are a WARNING, not a gate. Most broken links point to claims
    # in other open PRs that haven't merged yet — they resolve naturally as the
    # dependency chain merges. LLM reviewers catch genuinely missing references.
    # (Cory directive, Mar 14: "they'll likely merge")
    for filepath, content in all_md_files.items():
        link_errors = validate_wiki_links(content, existing_claims)
        if link_errors:
            issues.append("broken_wiki_links")
            details.append(f"{filepath}: (warn) {', '.join(e[:60] for e in link_errors[:3])}")
            # NOT gate_failed — wiki links are warnings, not blockers

    # Deduplicate tags while preserving first-seen order.
    unique_issues = list(dict.fromkeys(issues))
    return not gate_failed, unique_issues, details
|
|
|
|
|
|
async def validate_pr(conn, pr_number: int) -> dict:
    """Run Tier 0 + Tier 0.5 validation on a single PR.

    Tier 0: per-claim validation (schema, date, title, wiki links, proposition).
    Tier 0.5: diff-level mechanical checks (frontmatter schema on new files, wiki links on all .md).

    Both must pass for tier0_pass = 1. If either fails, eval won't touch this PR.
    Fixer handles wiki links; non-fixable issues exhaust fix_attempts → terminal.

    Args:
        conn: DB connection used for the prs update and audit record.
        pr_number: PR to validate.

    Returns {pr, all_pass, total, passing, skipped, reason, tier05_issues}.
    Note: the returned all_pass is the strict combined result, while the DB
    gets effective_pass, which ignores warning-only Tier 0.5 tags (see below).
    """
    # Get HEAD SHA for idempotency
    head_sha = await _get_pr_head_sha(pr_number)

    # Skip if already validated for this commit
    if await _has_tier0_comment(pr_number, head_sha):
        logger.debug("PR #%d already validated at %s", pr_number, head_sha[:8])
        return {"pr": pr_number, "skipped": True, "reason": "already_validated"}

    # Fetch diff
    diff = await get_pr_diff(pr_number)
    if not diff:
        logger.debug("PR #%d: empty or oversized diff", pr_number)
        return {"pr": pr_number, "skipped": True, "reason": "no_diff"}

    # Load existing claims index (shared between Tier 0 and Tier 0.5)
    existing_claims = load_existing_claims()

    # Extract claim files (domains/, core/, foundations/)
    claim_files = extract_claim_files_from_diff(diff)

    # ── Tier 0: per-claim validation ──
    # Only validates NEW files (not modified). Modified files have partial content
    # from diffs (only + lines) — frontmatter parsing fails on partial content,
    # producing false no_frontmatter violations. Enrichment PRs that modify
    # existing claim files were getting stuck here. (Epimetheus session 2)
    new_files = _new_files_in_diff(diff)
    results = []
    for filepath, content in claim_files.items():
        if filepath not in new_files:
            continue  # Skip modified files — partial diff content can't be validated
        result = tier0_validate_claim(filepath, content, existing_claims)
        results.append(result)
        status = "PASS" if result["passes"] else "FAIL"
        logger.debug("PR #%d: %s %s v=%s w=%s", pr_number, status, filepath, result["violations"], result["warnings"])

    # No new claim files at all → vacuous Tier 0 pass.
    tier0_pass = all(r["passes"] for r in results) if results else True
    total = len(results)
    passing = sum(1 for r in results if r["passes"])

    # ── Tier 0.5: diff-level mechanical checks ──
    # Always runs — catches broken wiki links in ALL .md files including entities.
    t05_pass, t05_issues, t05_details = tier05_mechanical_check(diff, existing_claims)

    if not claim_files and t05_pass:
        # Entity/source-only PR with no wiki link issues — pass through
        logger.debug("PR #%d: no claim files, Tier 0.5 passed — auto-pass", pr_number)
    elif not claim_files and not t05_pass:
        logger.info("PR #%d: no claim files but Tier 0.5 failed: %s", pr_number, t05_issues)

    # Combined result: both tiers must pass
    all_pass = tier0_pass and t05_pass

    logger.info(
        "PR #%d: Tier 0 — %d/%d pass | Tier 0.5 — %s (issues: %s) | combined: %s",
        pr_number, passing, total, "PASS" if t05_pass else "FAIL", t05_issues, all_pass,
    )

    # Post combined comment
    await _post_validation_comment(pr_number, results, head_sha, t05_issues, t05_details)

    # Update PR record — reset eval state on new commits
    # WARNING-ONLY issue tags (broken_wiki_links, near_duplicate) should NOT
    # prevent tier0_pass. Only blocking tags (frontmatter_schema, etc.) gate.
    # This was causing an infinite fixer→validate loop where wiki link warnings
    # kept resetting tier0_pass=0. (Epimetheus, session 2 fix)
    # Determine effective pass: per-claim violations always gate. Tier 0.5 warnings don't.
    # (Ganymede: verify this doesn't accidentally pass real schema failures)
    WARNING_ONLY_TAGS = {"broken_wiki_links", "near_duplicate"}
    blocking_t05_issues = set(t05_issues) - WARNING_ONLY_TAGS if t05_issues else set()
    # Pass if: per-claim checks pass AND no blocking Tier 0.5 issues
    effective_pass = tier0_pass and not blocking_t05_issues

    # Resetting eval_attempts/verdicts re-queues the PR for a fresh eval run.
    conn.execute(
        """UPDATE prs SET tier0_pass = ?,
               eval_attempts = 0, eval_issues = ?,
               domain_verdict = 'pending', leo_verdict = 'pending',
               last_error = NULL
           WHERE number = ?""",
        (1 if effective_pass else 0, json.dumps(t05_issues) if t05_issues else "[]", pr_number),
    )
    db.audit(
        conn,
        "validate",
        "tier0_complete",
        json.dumps({
            "pr": pr_number, "pass": all_pass,
            "tier0_pass": tier0_pass, "tier05_pass": t05_pass,
            "passing": passing, "total": total,
            "tier05_issues": t05_issues,
        }),
    )

    return {
        "pr": pr_number, "all_pass": all_pass,
        "total": total, "passing": passing,
        "tier05_issues": t05_issues,
    }
|
|
|
|
|
|
async def validate_cycle(conn, max_workers=None) -> tuple[int, int]:
    """Run one validation cycle.

    Finds PRs with status='open' and tier0_pass IS NULL, validates them.

    Args:
        conn: DB connection; rows must support ["number"] access.
        max_workers: cap on PRs processed this cycle (defaults to 10).

    Returns:
        (succeeded, failed) — succeeded counts PRs processed without an
        exception, regardless of whether their claims passed validation.
    """
    # Find unvalidated PRs (priority ordered)
    # PR-level priority overrides source-level priority; unknown values sort last.
    rows = conn.execute(
        """SELECT p.number FROM prs p
           LEFT JOIN sources s ON p.source_path = s.path
           WHERE p.status = 'open'
           AND p.tier0_pass IS NULL
           ORDER BY
           CASE COALESCE(p.priority, s.priority, 'medium')
               WHEN 'critical' THEN 0
               WHEN 'high' THEN 1
               WHEN 'medium' THEN 2
               WHEN 'low' THEN 3
               ELSE 4
           END,
           p.created_at ASC
           LIMIT ?""",
        (max_workers or 10,),
    ).fetchall()

    if not rows:
        return 0, 0

    succeeded = 0
    failed = 0

    for row in rows:
        try:
            result = await validate_pr(conn, row["number"])
            if result.get("skipped"):
                # Mark as validated even if skipped (no claims = pass)
                conn.execute(
                    "UPDATE prs SET tier0_pass = 1 WHERE number = ? AND tier0_pass IS NULL",
                    (row["number"],),
                )
                succeeded += 1
            elif result.get("all_pass"):
                succeeded += 1
            else:
                succeeded += 1  # Validation ran successfully, even if claims failed
        except Exception:
            # Keep the cycle alive — one broken PR must not block the rest.
            logger.exception("Failed to validate PR #%d", row["number"])
            failed += 1

    if succeeded or failed:
        logger.info("Validate cycle: %d validated, %d errors", succeeded, failed)

    return succeeded, failed
|