feat: add wiki-link audit script for codex graph integrity

Crawls domains/foundations/core/decisions for [[wiki-links]], resolves
against claim files, entities, maps, and agents. Reports dead links,
orphans, and connectivity stats. Prerequisite for the CI connectivity-scoring
bonus: dead links would otherwise count toward and inflate the score.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
m3taversal committed 2026-04-21 10:46:55 +01:00
commit e043cf98dc (parent 9c0be78620)

scripts/audit-wiki-links.py (new file, 259 lines)

@@ -0,0 +1,259 @@
#!/usr/bin/env python3
"""Audit wiki-links across the teleo-codex knowledge base.
Crawls domains/, foundations/, core/, decisions/ for [[wiki-links]].
Resolves each link against known claim files, entity files, and _map files.
Reports dead links, orphaned claims, and link counts.
Output: JSON to stdout with dead links, orphans, and per-file link counts.
"""
import json
import os
import re
import sys
import unicodedata
from pathlib import Path
CODEX_ROOT = Path(os.environ.get("CODEX_ROOT", "/opt/teleo-eval/workspaces/main"))
CLAIM_DIRS = ["domains", "foundations", "core", "decisions"]
ENTITY_DIR = "entities"
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
def slugify(title: str) -> str:
"""Convert a wiki-link title to the kebab-case slug used for filenames."""
s = title.strip().lower()
s = unicodedata.normalize("NFKD", s)
s = re.sub(r"[^\w\s-]", "", s)
s = re.sub(r"[\s_]+", "-", s)
s = re.sub(r"-+", "-", s)
return s.strip("-")
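# slugify() examples (illustrative titles): "Liquid Democracy (v2)" -> "liquid-democracy-v2",
# "MetaDAO's Futarchy" -> "metadaos-futarchy" (punctuation dropped, whitespace collapsed to dashes).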
def build_index(codex: Path) -> dict:
"""Build a lookup index of all resolvable targets.
Returns dict mapping normalized slug -> file path.
Also maps raw stem (filename without .md) -> file path.
"""
index = {}
# Index claim files across all claim directories
for claim_dir in CLAIM_DIRS:
d = codex / claim_dir
if not d.exists():
continue
for md in d.rglob("*.md"):
stem = md.stem
rel = str(md.relative_to(codex))
# Map by stem (exact filename match)
index[stem.lower()] = rel
# Map by slugified stem
index[slugify(stem)] = rel
# Index entity files
entity_root = codex / ENTITY_DIR
if entity_root.exists():
for md in entity_root.rglob("*.md"):
stem = md.stem
rel = str(md.relative_to(codex))
index[stem.lower()] = rel
index[slugify(stem)] = rel
# Index maps/ directory (MOC-style overview docs)
maps_root = codex / "maps"
if maps_root.exists():
for md in maps_root.rglob("*.md"):
stem = md.stem
rel = str(md.relative_to(codex))
index[stem.lower()] = rel
index[slugify(stem)] = rel
# Index top-level docs that might be link targets
for special in ["overview.md", "livingip-overview.md"]:
p = codex / special
if p.exists():
index[p.stem.lower()] = str(p.relative_to(codex))
# Index agents/ beliefs and positions (sometimes linked)
agents_dir = codex / "agents"
if agents_dir.exists():
for md in agents_dir.rglob("*.md"):
stem = md.stem
rel = str(md.relative_to(codex))
index[stem.lower()] = rel
return index
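# Example index entries (illustrative paths):
#   "prediction-markets" -> "domains/governance/prediction-markets.md"
#   "_map"               -> "decisions/_map.md"   # plain dict assignment, so the last file indexed wins
# The collision on "_map" is one reason resolve_link() special-cases [[_map]] before consulting the index.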
def resolve_link(link_text: str, index: dict, source_dir: str, codex: Path) -> str | None:
"""Resolve a wiki-link target against the audited codex root. Tries, in order:
the same-directory _map special case, explicit path references, exact stem match,
slugified match, then known spelling variants. Returns a repo-relative path or None."""
text = link_text.strip()
# Special case: [[_map]] resolves to _map.md in the same domain directory
if text == "_map":
parts = source_dir.split("/")
if len(parts) >= 2:
candidate = f"{parts[0]}/{parts[1]}/_map.md"
if (codex / candidate).exists():
return candidate
return None
# Path-style references like [[domains/health/_map]]
if "/" in text:
candidate = text.rstrip("/")
if not candidate.endswith(".md"):
candidate += ".md"
if (codex / candidate).exists():
return candidate
return None
# Try exact stem match (lowercased)
key = text.lower()
if key in index:
return index[key]
# Try slugified version
slug = slugify(text)
if slug in index:
return index[slug]
# Try with common variations
for variant in [
slug.replace("metadaos", "metadao"),
slug.replace("ais", "ai"),
]:
if variant in index:
return index[variant]
return None
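# The variant fallback above papers over minor spelling drift in link text, e.g. a
# [[MetaDAOs]] link slugging to "metadaos" while the file is named metadao.md
# (filenames here are illustrative).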
def audit(codex: Path) -> dict:
"""Run the full wiki-link audit."""
index = build_index(codex)
dead_links = [] # {file, link, line_number}
link_counts = {} # file -> {outbound: N, targets: []}
all_targets = set() # files that are linked TO
all_files = set() # all claim/foundation files
# Scan all markdown files in claim directories
for claim_dir in CLAIM_DIRS:
d = codex / claim_dir
if not d.exists():
continue
for md in d.rglob("*.md"):
rel = str(md.relative_to(codex))
all_files.add(rel)
source_dir = str(md.parent.relative_to(codex))
try:
content = md.read_text(encoding="utf-8")
except Exception:
continue
links_in_file = []
for i, line in enumerate(content.split("\n"), 1):
for match in WIKI_LINK_RE.finditer(line):
link_text = match.group(1)
# Alias form [[target|display text]]: resolve against the target part only
if "|" in link_text:
link_text = link_text.split("|")[0].strip()
resolved = resolve_link(link_text, index, source_dir, codex)
if resolved:
all_targets.add(resolved)
links_in_file.append(resolved)
else:
dead_links.append({
"file": rel,
"link": link_text,
"line": i,
})
link_counts[rel] = {
"outbound": len(links_in_file),
"targets": links_in_file,
}
# Find orphaned claims (no inbound links AND no outbound links)
files_with_outbound = {f for f, c in link_counts.items() if c["outbound"] > 0}
orphaned = sorted(
f for f in all_files
if f not in all_targets
and f not in files_with_outbound
and not f.endswith("_map.md") # MOC files are structural, not orphans
)
# Compute inbound link counts
inbound_counts = {}
for f, c in link_counts.items():
for target in c["targets"]:
inbound_counts[target] = inbound_counts.get(target, 0) + 1
# Claims with high outbound (good connectivity)
high_connectivity = sorted(
[(f, c["outbound"]) for f, c in link_counts.items() if c["outbound"] >= 3],
key=lambda x: -x[1],
)
# Summary stats
total_links = sum(c["outbound"] for c in link_counts.values())
files_with_links = sum(1 for c in link_counts.values() if c["outbound"] > 0)
# Domain breakdown of dead links
dead_by_domain = {}
for dl in dead_links:
parts = dl["file"].split("/")
domain = parts[1] if len(parts) >= 3 else parts[0]
dead_by_domain[domain] = dead_by_domain.get(domain, 0) + 1
# Domain breakdown of orphans
orphan_by_domain = {}
for o in orphaned:
parts = o.split("/")
domain = parts[1] if len(parts) >= 3 else parts[0]
orphan_by_domain[domain] = orphan_by_domain.get(domain, 0) + 1
return {
"summary": {
"total_files": len(all_files),
"total_links": total_links,
"files_with_links": files_with_links,
"files_without_links": len(all_files) - files_with_links,
"dead_link_count": len(dead_links),
"orphan_count": len(orphaned),
"avg_links_per_file": round(total_links / max(len(all_files), 1), 2),
"high_connectivity_count": len(high_connectivity),
},
"dead_links": dead_links,
"dead_by_domain": dict(sorted(dead_by_domain.items(), key=lambda x: -x[1])),
"orphaned": orphaned,
"orphan_by_domain": dict(sorted(orphan_by_domain.items(), key=lambda x: -x[1])),
"high_connectivity": [{"file": f, "outbound_links": n} for f, n in high_connectivity[:20]],
"inbound_top20": sorted(
[{"file": f, "inbound_links": n} for f, n in inbound_counts.items()],
key=lambda x: -x["inbound_links"],
)[:20],
}
if __name__ == "__main__":
codex = Path(sys.argv[1]) if len(sys.argv) > 1 else CODEX_ROOT
result = audit(codex)
json.dump(result, sys.stdout, indent=2)
print()
# Print human-readable summary to stderr
s = result["summary"]
print(f"\n=== Wiki-Link Audit ===", file=sys.stderr)
print(f"Files scanned: {s['total_files']}", file=sys.stderr)
print(f"Total links: {s['total_links']}", file=sys.stderr)
print(f"Files with links: {s['files_with_links']} ({100*s['files_with_links']//max(s['total_files'],1)}%)", file=sys.stderr)
print(f"Dead links: {s['dead_link_count']}", file=sys.stderr)
print(f"Orphaned claims: {s['orphan_count']}", file=sys.stderr)
print(f"Avg links/file: {s['avg_links_per_file']}", file=sys.stderr)
print(f"High connectivity (≥3 links): {s['high_connectivity_count']}", file=sys.stderr)