From e043cf98dc134d57135a5eaf44150c9a9c24aca5 Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Tue, 21 Apr 2026 10:46:55 +0100
Subject: [PATCH] feat: add wiki-link audit script for codex graph integrity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Crawls domains/foundations/core/decisions for [[wiki-links]], resolves
against claim files, entities, maps, and agents. Reports dead links,
orphans, and connectivity stats.

Prerequisite for CI scoring connectivity bonus — broken links would
inflate scores.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 scripts/audit-wiki-links.py | 259 ++++++++++++++++++++++++++++++++++++
 1 file changed, 259 insertions(+)
 create mode 100644 scripts/audit-wiki-links.py

diff --git a/scripts/audit-wiki-links.py b/scripts/audit-wiki-links.py
new file mode 100644
index 0000000..9aef144
--- /dev/null
+++ b/scripts/audit-wiki-links.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""Audit wiki-links across the teleo-codex knowledge base.
+
+Crawls domains/, foundations/, core/, decisions/ for [[wiki-links]].
+Resolves each link against known claim files, entity files, and _map files.
+Reports dead links, orphaned claims, and link counts.
+
+Output: JSON to stdout with dead links, orphans, and per-file link counts.
+"""
+
+import json
+import os
+import re
+import sys
+import unicodedata
+from pathlib import Path
+
+CODEX_ROOT = Path(os.environ.get("CODEX_ROOT", "/opt/teleo-eval/workspaces/main"))
+CLAIM_DIRS = ["domains", "foundations", "core", "decisions"]
+ENTITY_DIR = "entities"
+
+WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
+
+
+def slugify(title: str) -> str:
+    """Convert a wiki-link title to the kebab-case slug used for filenames."""
+    s = title.strip().lower()
+    s = unicodedata.normalize("NFKD", s)
+    s = re.sub(r"[^\w\s-]", "", s)
+    s = re.sub(r"[\s_]+", "-", s)
+    s = re.sub(r"-+", "-", s)
+    return s.strip("-")
+
+
+def build_index(codex: Path) -> dict:
+    """Build a lookup index of all resolvable targets.
+
+    Returns dict mapping normalized slug -> file path.
+    Also maps raw stem (filename without .md) -> file path.
+    """
+    index = {}
+
+    # Index claim files across all claim directories
+    for claim_dir in CLAIM_DIRS:
+        d = codex / claim_dir
+        if not d.exists():
+            continue
+        for md in d.rglob("*.md"):
+            stem = md.stem
+            rel = str(md.relative_to(codex))
+            # Map by stem (exact filename match)
+            index[stem.lower()] = rel
+            # Map by slugified stem
+            index[slugify(stem)] = rel
+
+    # Index entity files
+    entity_root = codex / ENTITY_DIR
+    if entity_root.exists():
+        for md in entity_root.rglob("*.md"):
+            stem = md.stem
+            rel = str(md.relative_to(codex))
+            index[stem.lower()] = rel
+            index[slugify(stem)] = rel
+
+    # Index maps/ directory (MOC-style overview docs)
+    maps_root = codex / "maps"
+    if maps_root.exists():
+        for md in maps_root.rglob("*.md"):
+            stem = md.stem
+            rel = str(md.relative_to(codex))
+            index[stem.lower()] = rel
+            index[slugify(stem)] = rel
+
+    # Index top-level docs that might be link targets
+    for special in ["overview.md", "livingip-overview.md"]:
+        p = codex / special
+        if p.exists():
+            index[p.stem.lower()] = str(p.relative_to(codex))
+
+    # Index agents/ beliefs and positions (sometimes linked)
+    agents_dir = codex / "agents"
+    if agents_dir.exists():
+        for md in agents_dir.rglob("*.md"):
+            stem = md.stem
+            rel = str(md.relative_to(codex))
+            index[stem.lower()] = rel
+
+    return index
+
+
+def resolve_link(link_text: str, index: dict, source_dir: str, codex: Path) -> str | None:
+    """Try to resolve a wiki-link target against the codex under audit. Returns file path or None."""
+    text = link_text.strip()
+
+    # Special case: [[_map]] resolves to _map.md in the same domain directory
+    if text == "_map":
+        parts = source_dir.split("/")
+        if len(parts) >= 2:
+            candidate = f"{parts[0]}/{parts[1]}/_map.md"
+            if (codex / candidate).exists():
+                return candidate
+        return None
+
+    # Path-style references like [[domains/health/_map]]
+    if "/" in text:
+        candidate = text.rstrip("/")
+        if not candidate.endswith(".md"):
+            candidate += ".md"
+        if (codex / candidate).exists():
+            return candidate
+        return None
+
+    # Try exact stem match (lowercased)
+    key = text.lower()
+    if key in index:
+        return index[key]
+
+    # Try slugified version
+    slug = slugify(text)
+    if slug in index:
+        return index[slug]
+
+    # Try with common variations
+    for variant in [
+        slug.replace("metadaos", "metadao"),
+        slug.replace("ais", "ai"),
+    ]:
+        if variant in index:
+            return index[variant]
+
+    return None
+
+
+def audit(codex: Path) -> dict:
+    """Run the full wiki-link audit."""
+    index = build_index(codex)
+
+    dead_links = []  # {file, link, line}
+    link_counts = {}  # file -> {outbound: N, targets: []}
+    all_targets = set()  # files that are linked TO
+    all_files = set()  # all claim/foundation files
+
+    # Scan all markdown files in claim directories
+    for claim_dir in CLAIM_DIRS:
+        d = codex / claim_dir
+        if not d.exists():
+            continue
+        for md in d.rglob("*.md"):
+            rel = str(md.relative_to(codex))
+            all_files.add(rel)
+            source_dir = str(md.parent.relative_to(codex))
+
+            try:
+                content = md.read_text(encoding="utf-8")
+            except Exception:
+                continue
+
+            links_in_file = []
+            for i, line in enumerate(content.split("\n"), 1):
+                for match in WIKI_LINK_RE.finditer(line):
+                    link_text = match.group(1)
+                    # Links with | carry display aliases - keep only the target part
+                    if "|" in link_text:
+                        link_text = link_text.split("|")[0].strip()
+
+                    resolved = resolve_link(link_text, index, source_dir, codex)
+                    if resolved:
+                        all_targets.add(resolved)
+                        links_in_file.append(resolved)
+                    else:
+                        dead_links.append({
+                            "file": rel,
+                            "link": link_text,
+                            "line": i,
+                        })
+
+            link_counts[rel] = {
+                "outbound": len(links_in_file),
+                "targets": links_in_file,
+            }
+
+    # Find orphaned claims (no inbound links AND no outbound links)
+    files_with_outbound = {f for f, c in link_counts.items() if c["outbound"] > 0}
+    orphaned = sorted(
+        f for f in all_files
+        if f not in all_targets
+        and f not in files_with_outbound
+        and not f.endswith("_map.md")  # MOC files are structural, not orphans
+    )
+
+    # Compute inbound link counts
+    inbound_counts = {}
+    for f, c in link_counts.items():
+        for target in c["targets"]:
+            inbound_counts[target] = inbound_counts.get(target, 0) + 1
+
+    # Claims with high outbound (good connectivity)
+    high_connectivity = sorted(
+        [(f, c["outbound"]) for f, c in link_counts.items() if c["outbound"] >= 3],
+        key=lambda x: -x[1],
+    )
+
+    # Summary stats
+    total_links = sum(c["outbound"] for c in link_counts.values())
+    files_with_links = sum(1 for c in link_counts.values() if c["outbound"] > 0)
+
+    # Domain breakdown of dead links
+    dead_by_domain = {}
+    for dl in dead_links:
+        parts = dl["file"].split("/")
+        domain = parts[1] if len(parts) >= 3 else parts[0]
+        dead_by_domain[domain] = dead_by_domain.get(domain, 0) + 1
+
+    # Domain breakdown of orphans
+    orphan_by_domain = {}
+    for o in orphaned:
+        parts = o.split("/")
+        domain = parts[1] if len(parts) >= 3 else parts[0]
+        orphan_by_domain[domain] = orphan_by_domain.get(domain, 0) + 1
+
+    return {
+        "summary": {
+            "total_files": len(all_files),
+            "total_links": total_links,
+            "files_with_links": files_with_links,
+            "files_without_links": len(all_files) - files_with_links,
+            "dead_link_count": len(dead_links),
+            "orphan_count": len(orphaned),
+            "avg_links_per_file": round(total_links / max(len(all_files), 1), 2),
+            "high_connectivity_count": len(high_connectivity),
+        },
+        "dead_links": dead_links,
+        "dead_by_domain": dict(sorted(dead_by_domain.items(), key=lambda x: -x[1])),
+        "orphaned": orphaned,
+        "orphan_by_domain": dict(sorted(orphan_by_domain.items(), key=lambda x: -x[1])),
+        "high_connectivity": [{"file": f, "outbound_links": n} for f, n in high_connectivity[:20]],
+        "inbound_top20": sorted(
+            [{"file": f, "inbound_links": n} for f, n in inbound_counts.items()],
+            key=lambda x: -x["inbound_links"],
+        )[:20],
+    }
+
+
+if __name__ == "__main__":
+    codex = Path(sys.argv[1]) if len(sys.argv) > 1 else CODEX_ROOT
+    result = audit(codex)
+    json.dump(result, sys.stdout, indent=2)
+    print()
+
+    # Print human-readable summary to stderr
+    s = result["summary"]
+    print("\n=== Wiki-Link Audit ===", file=sys.stderr)
+    print(f"Files scanned: {s['total_files']}", file=sys.stderr)
+    print(f"Total links: {s['total_links']}", file=sys.stderr)
+    print(f"Files with links: {s['files_with_links']} ({100*s['files_with_links']//max(s['total_files'],1)}%)", file=sys.stderr)
+    print(f"Dead links: {s['dead_link_count']}", file=sys.stderr)
+    print(f"Orphaned claims: {s['orphan_count']}", file=sys.stderr)
+    print(f"Avg links/file: {s['avg_links_per_file']}", file=sys.stderr)
+    print(f"High connectivity (≥3 links): {s['high_connectivity_count']}", file=sys.stderr)
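
Usage sketch: the script takes the codex root from argv[1] (falling back to the
CODEX_ROOT environment variable), writes the JSON report to stdout, and prints the
human-readable summary to stderr. A minimal way to drive it from Python, assuming it
is invoked from the repository root; the codex path shown is just the script's own
default and may need adjusting:

    import json
    import subprocess

    # Run the audit against an example codex checkout (path is the script's default).
    proc = subprocess.run(
        ["python3", "scripts/audit-wiki-links.py", "/opt/teleo-eval/workspaces/main"],
        capture_output=True, text=True, check=True,
    )

    report = json.loads(proc.stdout)   # full JSON report from stdout
    print(proc.stderr)                 # human-readable summary emitted on stderr
    print(report["summary"]["dead_link_count"])

A CI gate would typically fail the job when dead_link_count is non-zero before any
connectivity bonus is scored.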