feat: add wiki-link audit script for codex graph integrity
Crawls domains/, foundations/, core/, and decisions/ for [[wiki-links]] and resolves each link against known claim files, entities, maps, and agents. Reports dead links, orphaned claims, and connectivity stats. This is a prerequisite for the CI connectivity-scoring bonus: broken links would otherwise inflate scores.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
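For context, a minimal sketch of the CI gate this enables (illustrative only, not part of this commit; the workspace path and gating policy are assumptions): run the audit, parse its JSON report, and fail the step whenever dead links are reported, so they cannot inflate the connectivity score.

    import json
    import subprocess
    import sys

    # Run the audit script; it writes the JSON report to stdout and a summary to stderr.
    out = subprocess.run(
        [sys.executable, "scripts/audit-wiki-links.py", "/opt/teleo-eval/workspaces/main"],
        capture_output=True, text=True, check=True,
    ).stdout

    report = json.loads(out)
    dead = report["summary"]["dead_link_count"]

    # Gate: refuse to award the connectivity bonus while any wiki-link is broken.
    sys.exit(1 if dead else 0)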
This commit is contained in:
parent
9c0be78620
commit
e043cf98dc
1 changed file with 259 additions and 0 deletions
scripts/audit-wiki-links.py (new file, 259 additions)
@@ -0,0 +1,259 @@
#!/usr/bin/env python3
"""Audit wiki-links across the teleo-codex knowledge base.

Crawls domains/, foundations/, core/, decisions/ for [[wiki-links]].
Resolves each link against known claim files, entity files, and _map files.
Reports dead links, orphaned claims, and link counts.

Output: JSON to stdout with dead links, orphans, and per-file link counts.
"""

import json
import os
import re
import sys
import unicodedata
from pathlib import Path

CODEX_ROOT = Path(os.environ.get("CODEX_ROOT", "/opt/teleo-eval/workspaces/main"))
CLAIM_DIRS = ["domains", "foundations", "core", "decisions"]
ENTITY_DIR = "entities"

WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")


def slugify(title: str) -> str:
    """Convert a wiki-link title to the kebab-case slug used for filenames."""
    s = title.strip().lower()
    s = unicodedata.normalize("NFKD", s)
    s = re.sub(r"[^\w\s-]", "", s)
    s = re.sub(r"[\s_]+", "-", s)
    s = re.sub(r"-+", "-", s)
    return s.strip("-")


def build_index(codex: Path) -> dict:
    """Build a lookup index of all resolvable targets.

    Returns dict mapping normalized slug -> file path.
    Also maps raw stem (filename without .md) -> file path.
    """
    index = {}

    # Index claim files across all claim directories
    for claim_dir in CLAIM_DIRS:
        d = codex / claim_dir
        if not d.exists():
            continue
        for md in d.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            # Map by stem (exact filename match)
            index[stem.lower()] = rel
            # Map by slugified stem
            index[slugify(stem)] = rel

    # Index entity files
    entity_root = codex / ENTITY_DIR
    if entity_root.exists():
        for md in entity_root.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            index[stem.lower()] = rel
            index[slugify(stem)] = rel

    # Index maps/ directory (MOC-style overview docs)
    maps_root = codex / "maps"
    if maps_root.exists():
        for md in maps_root.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            index[stem.lower()] = rel
            index[slugify(stem)] = rel

    # Index top-level docs that might be link targets
    for special in ["overview.md", "livingip-overview.md"]:
        p = codex / special
        if p.exists():
            index[p.stem.lower()] = str(p.relative_to(codex))

    # Index agents/ beliefs and positions (sometimes linked)
    agents_dir = codex / "agents"
    if agents_dir.exists():
        for md in agents_dir.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            index[stem.lower()] = rel

    return index

def resolve_link(link_text: str, index: dict, source_dir: str, codex: Path = CODEX_ROOT) -> str | None:
    """Try to resolve a wiki-link target. Returns file path or None."""
    text = link_text.strip()

    # Special case: [[_map]] resolves to _map.md in the same domain directory
    if text == "_map":
        parts = source_dir.split("/")
        if len(parts) >= 2:
            candidate = f"{parts[0]}/{parts[1]}/_map.md"
            if (codex / candidate).exists():
                return candidate
        return None

    # Path-style references like [[domains/health/_map]]
    if "/" in text:
        candidate = text.rstrip("/")
        if not candidate.endswith(".md"):
            candidate += ".md"
        if (codex / candidate).exists():
            return candidate
        return None

    # Try exact stem match (lowercased)
    key = text.lower()
    if key in index:
        return index[key]

    # Try slugified version
    slug = slugify(text)
    if slug in index:
        return index[slug]

    # Try with common variations
    for variant in [
        slug.replace("metadaos", "metadao"),
        slug.replace("ais", "ai"),
    ]:
        if variant in index:
            return index[variant]

    return None

def audit(codex: Path) -> dict:
    """Run the full wiki-link audit."""
    index = build_index(codex)

    dead_links = []  # {file, link, line_number}
    link_counts = {}  # file -> {outbound: N, targets: []}
    all_targets = set()  # files that are linked TO
    all_files = set()  # all claim/foundation files

    # Scan all markdown files in claim directories
    for claim_dir in CLAIM_DIRS:
        d = codex / claim_dir
        if not d.exists():
            continue
        for md in d.rglob("*.md"):
            rel = str(md.relative_to(codex))
            all_files.add(rel)
            source_dir = str(md.parent.relative_to(codex))

            try:
                content = md.read_text(encoding="utf-8")
            except Exception:
                continue

            links_in_file = []
            for i, line in enumerate(content.split("\n"), 1):
                for match in WIKI_LINK_RE.finditer(line):
                    link_text = match.group(1)
                    # Links with | carry display-text aliases - keep only the target part
                    if "|" in link_text:
                        link_text = link_text.split("|")[0].strip()

                    resolved = resolve_link(link_text, index, source_dir, codex)
                    if resolved:
                        all_targets.add(resolved)
                        links_in_file.append(resolved)
                    else:
                        dead_links.append({
                            "file": rel,
                            "link": link_text,
                            "line": i,
                        })

            link_counts[rel] = {
                "outbound": len(links_in_file),
                "targets": links_in_file,
            }

    # Find orphaned claims (no inbound links AND no outbound links)
    files_with_outbound = {f for f, c in link_counts.items() if c["outbound"] > 0}
    orphaned = sorted(
        f for f in all_files
        if f not in all_targets
        and f not in files_with_outbound
        and not f.endswith("_map.md")  # MOC files are structural, not orphans
    )

    # Compute inbound link counts
    inbound_counts = {}
    for f, c in link_counts.items():
        for target in c["targets"]:
            inbound_counts[target] = inbound_counts.get(target, 0) + 1

    # Claims with high outbound (good connectivity)
    high_connectivity = sorted(
        [(f, c["outbound"]) for f, c in link_counts.items() if c["outbound"] >= 3],
        key=lambda x: -x[1],
    )

    # Summary stats
    total_links = sum(c["outbound"] for c in link_counts.values())
    files_with_links = sum(1 for c in link_counts.values() if c["outbound"] > 0)

    # Domain breakdown of dead links
    dead_by_domain = {}
    for dl in dead_links:
        parts = dl["file"].split("/")
        domain = parts[1] if len(parts) >= 3 else parts[0]
        dead_by_domain[domain] = dead_by_domain.get(domain, 0) + 1

    # Domain breakdown of orphans
    orphan_by_domain = {}
    for o in orphaned:
        parts = o.split("/")
        domain = parts[1] if len(parts) >= 3 else parts[0]
        orphan_by_domain[domain] = orphan_by_domain.get(domain, 0) + 1

    return {
        "summary": {
            "total_files": len(all_files),
            "total_links": total_links,
            "files_with_links": files_with_links,
            "files_without_links": len(all_files) - files_with_links,
            "dead_link_count": len(dead_links),
            "orphan_count": len(orphaned),
            "avg_links_per_file": round(total_links / max(len(all_files), 1), 2),
            "high_connectivity_count": len(high_connectivity),
        },
        "dead_links": dead_links,
        "dead_by_domain": dict(sorted(dead_by_domain.items(), key=lambda x: -x[1])),
        "orphaned": orphaned,
        "orphan_by_domain": dict(sorted(orphan_by_domain.items(), key=lambda x: -x[1])),
        "high_connectivity": [{"file": f, "outbound_links": n} for f, n in high_connectivity[:20]],
        "inbound_top20": sorted(
            [{"file": f, "inbound_links": n} for f, n in inbound_counts.items()],
            key=lambda x: -x["inbound_links"],
        )[:20],
    }


if __name__ == "__main__":
    codex = Path(sys.argv[1]) if len(sys.argv) > 1 else CODEX_ROOT
    result = audit(codex)
    json.dump(result, sys.stdout, indent=2)
    print()

    # Print human-readable summary to stderr
    s = result["summary"]
    print("\n=== Wiki-Link Audit ===", file=sys.stderr)
    print(f"Files scanned: {s['total_files']}", file=sys.stderr)
    print(f"Total links: {s['total_links']}", file=sys.stderr)
    print(f"Files with links: {s['files_with_links']} ({100*s['files_with_links']//max(s['total_files'],1)}%)", file=sys.stderr)
    print(f"Dead links: {s['dead_link_count']}", file=sys.stderr)
    print(f"Orphaned claims: {s['orphan_count']}", file=sys.stderr)
    print(f"Avg links/file: {s['avg_links_per_file']}", file=sys.stderr)
    print(f"High connectivity (≥3 links): {s['high_connectivity_count']}", file=sys.stderr)