feat: add wiki-link audit script for codex graph integrity

Crawls domains/foundations/core/decisions for [[wiki-links]], resolves
against claim files, entities, maps, and agents. Reports dead links,
orphans, and connectivity stats. Prerequisite for the CI connectivity-scoring
bonus: dead links would otherwise count toward and inflate the score.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
m3taversal committed 2026-04-21 10:46:55 +01:00
commit e043cf98dc (parent 9c0be78620)

scripts/audit-wiki-links.py (new file, 259 lines)

@@ -0,0 +1,259 @@
#!/usr/bin/env python3
"""Audit wiki-links across the teleo-codex knowledge base.
Crawls domains/, foundations/, core/, decisions/ for [[wiki-links]].
Resolves each link against known claim files, entity files, and _map files.
Reports dead links, orphaned claims, and link counts.
Output: JSON to stdout with dead links, orphans, and per-file link counts.
"""
import json
import os
import re
import sys
import unicodedata
from pathlib import Path
CODEX_ROOT = Path(os.environ.get("CODEX_ROOT", "/opt/teleo-eval/workspaces/main"))
CLAIM_DIRS = ["domains", "foundations", "core", "decisions"]
ENTITY_DIR = "entities"
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
def slugify(title: str) -> str:
"""Convert a wiki-link title to the kebab-case slug used for filenames."""
s = title.strip().lower()
s = unicodedata.normalize("NFKD", s)
s = re.sub(r"[^\w\s-]", "", s)
s = re.sub(r"[\s_]+", "-", s)
s = re.sub(r"-+", "-", s)
return s.strip("-")
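# slugify() examples (illustrative titles): "Liquid Democracy (v2)" -> "liquid-democracy-v2",
# "MetaDAO's Futarchy" -> "metadaos-futarchy" (punctuation dropped, whitespace collapsed to dashes).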
def build_index(codex: Path) -> dict:
"""Build a lookup index of all resolvable targets.
Returns dict mapping normalized slug -> file path.
Also maps raw stem (filename without .md) -> file path.
"""
index = {}
# Index claim files across all claim directories
for claim_dir in CLAIM_DIRS:
d = codex / claim_dir
if not d.exists():
continue
for md in d.rglob("*.md"):
stem = md.stem
rel = str(md.relative_to(codex))
# Map by stem (exact filename match)
index[stem.lower()] = rel
# Map by slugified stem
index[slugify(stem)] = rel
# Index entity files
entity_root = codex / ENTITY_DIR
if entity_root.exists():
for md in entity_root.rglob("*.md"):
stem = md.stem
rel = str(md.relative_to(codex))
index[stem.lower()] = rel
index[slugify(stem)] = rel
# Index maps/ directory (MOC-style overview docs)
maps_root = codex / "maps"
if maps_root.exists():
for md in maps_root.rglob("*.md"):
stem = md.stem
rel = str(md.relative_to(codex))
index[stem.lower()] = rel
index[slugify(stem)] = rel
# Index top-level docs that might be link targets
for special in ["overview.md", "livingip-overview.md"]:
p = codex / special
if p.exists():
index[p.stem.lower()] = str(p.relative_to(codex))
# Index agents/ beliefs and positions (sometimes linked)
agents_dir = codex / "agents"
if agents_dir.exists():
for md in agents_dir.rglob("*.md"):
stem = md.stem
rel = str(md.relative_to(codex))
index[stem.lower()] = rel
return index
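# Example index entries (illustrative paths):
#   "prediction-markets" -> "domains/governance/prediction-markets.md"
#   "_map"               -> "decisions/_map.md"   # plain dict assignment, so the last file indexed wins
# The collision on "_map" is one reason resolve_link() special-cases [[_map]] before consulting the index.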
def resolve_link(link_text: str, index: dict, source_dir: str, codex: Path) -> str | None:
"""Resolve a wiki-link target against the audited codex root. Tries, in order:
the same-directory _map special case, explicit path references, exact stem match,
slugified match, then known spelling variants. Returns a repo-relative path or None."""
text = link_text.strip()
# Special case: [[_map]] resolves to _map.md in the same domain directory
if text == "_map":
parts = source_dir.split("/")
if len(parts) >= 2:
candidate = f"{parts[0]}/{parts[1]}/_map.md"
if (codex / candidate).exists():
return candidate
return None
# Path-style references like [[domains/health/_map]]
if "/" in text:
candidate = text.rstrip("/")
if not candidate.endswith(".md"):
candidate += ".md"
if (codex / candidate).exists():
return candidate
return None
# Try exact stem match (lowercased)
key = text.lower()
if key in index:
return index[key]
# Try slugified version
slug = slugify(text)
if slug in index:
return index[slug]
# Try with common variations
for variant in [
slug.replace("metadaos", "metadao"),
slug.replace("ais", "ai"),
]:
if variant in index:
return index[variant]
return None
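# The variant fallback above papers over minor spelling drift in link text, e.g. a
# [[MetaDAOs]] link slugging to "metadaos" while the file is named metadao.md
# (filenames here are illustrative).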
def audit(codex: Path) -> dict:
"""Run the full wiki-link audit."""
index = build_index(codex)
dead_links = [] # {file, link, line_number}
link_counts = {} # file -> {outbound: N, targets: []}
all_targets = set() # files that are linked TO
all_files = set() # all claim/foundation files
# Scan all markdown files in claim directories
for claim_dir in CLAIM_DIRS:
d = codex / claim_dir
if not d.exists():
continue
for md in d.rglob("*.md"):
rel = str(md.relative_to(codex))
all_files.add(rel)
source_dir = str(md.parent.relative_to(codex))
try:
content = md.read_text(encoding="utf-8")
except Exception:
continue
links_in_file = []
for i, line in enumerate(content.split("\n"), 1):
for match in WIKI_LINK_RE.finditer(line):
link_text = match.group(1)
# Alias form [[target|display text]]: resolve against the target part only
if "|" in link_text:
link_text = link_text.split("|")[0].strip()
resolved = resolve_link(link_text, index, source_dir, codex)
if resolved:
all_targets.add(resolved)
links_in_file.append(resolved)
else:
dead_links.append({
"file": rel,
"link": link_text,
"line": i,
})
link_counts[rel] = {
"outbound": len(links_in_file),
"targets": links_in_file,
}
# Find orphaned claims (no inbound links AND no outbound links)
files_with_outbound = {f for f, c in link_counts.items() if c["outbound"] > 0}
orphaned = sorted(
f for f in all_files
if f not in all_targets
and f not in files_with_outbound
and not f.endswith("_map.md") # MOC files are structural, not orphans
)
# Compute inbound link counts
inbound_counts = {}
for f, c in link_counts.items():
for target in c["targets"]:
inbound_counts[target] = inbound_counts.get(target, 0) + 1
# Claims with high outbound (good connectivity)
high_connectivity = sorted(
[(f, c["outbound"]) for f, c in link_counts.items() if c["outbound"] >= 3],
key=lambda x: -x[1],
)
# Summary stats
total_links = sum(c["outbound"] for c in link_counts.values())
files_with_links = sum(1 for c in link_counts.values() if c["outbound"] > 0)
# Domain breakdown of dead links
dead_by_domain = {}
for dl in dead_links:
parts = dl["file"].split("/")
domain = parts[1] if len(parts) >= 3 else parts[0]
dead_by_domain[domain] = dead_by_domain.get(domain, 0) + 1
# Domain breakdown of orphans
orphan_by_domain = {}
for o in orphaned:
parts = o.split("/")
domain = parts[1] if len(parts) >= 3 else parts[0]
orphan_by_domain[domain] = orphan_by_domain.get(domain, 0) + 1
return {
"summary": {
"total_files": len(all_files),
"total_links": total_links,
"files_with_links": files_with_links,
"files_without_links": len(all_files) - files_with_links,
"dead_link_count": len(dead_links),
"orphan_count": len(orphaned),
"avg_links_per_file": round(total_links / max(len(all_files), 1), 2),
"high_connectivity_count": len(high_connectivity),
},
"dead_links": dead_links,
"dead_by_domain": dict(sorted(dead_by_domain.items(), key=lambda x: -x[1])),
"orphaned": orphaned,
"orphan_by_domain": dict(sorted(orphan_by_domain.items(), key=lambda x: -x[1])),
"high_connectivity": [{"file": f, "outbound_links": n} for f, n in high_connectivity[:20]],
"inbound_top20": sorted(
[{"file": f, "inbound_links": n} for f, n in inbound_counts.items()],
key=lambda x: -x["inbound_links"],
)[:20],
}
if __name__ == "__main__":
codex = Path(sys.argv[1]) if len(sys.argv) > 1 else CODEX_ROOT
result = audit(codex)
json.dump(result, sys.stdout, indent=2)
print()
# Print human-readable summary to stderr
s = result["summary"]
print(f"\n=== Wiki-Link Audit ===", file=sys.stderr)
print(f"Files scanned: {s['total_files']}", file=sys.stderr)
print(f"Total links: {s['total_links']}", file=sys.stderr)
print(f"Files with links: {s['files_with_links']} ({100*s['files_with_links']//max(s['total_files'],1)}%)", file=sys.stderr)
print(f"Dead links: {s['dead_link_count']}", file=sys.stderr)
print(f"Orphaned claims: {s['orphan_count']}", file=sys.stderr)
print(f"Avg links/file: {s['avg_links_per_file']}", file=sys.stderr)
print(f"High connectivity (≥3 links): {s['high_connectivity_count']}", file=sys.stderr)