#!/usr/bin/env python3
"""Audit wiki-links across the teleo-codex knowledge base.

Crawls domains/, foundations/, core/, decisions/ for [[wiki-links]].
Resolves each link against known claim files, entity files, and _map files.
Reports dead links, orphaned claims, and link counts.

Output: JSON to stdout with dead links, orphans, and per-file link counts.
"""
import json
import os
import re
import sys
import unicodedata
from pathlib import Path

CODEX_ROOT = Path(os.environ.get("CODEX_ROOT", "/opt/teleo-eval/workspaces/main"))
CLAIM_DIRS = ["domains", "foundations", "core", "decisions"]
ENTITY_DIR = "entities"
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")


def slugify(title: str) -> str:
    """Convert a wiki-link title to the kebab-case slug used for filenames."""
    s = title.strip().lower()
    s = unicodedata.normalize("NFKD", s)
    s = re.sub(r"[^\w\s-]", "", s)
    s = re.sub(r"[\s_]+", "-", s)
    s = re.sub(r"-+", "-", s)
    return s.strip("-")


def build_index(codex: Path) -> dict:
    """Build a lookup index of all resolvable targets.

    Returns dict mapping normalized slug -> file path.
    Also maps raw stem (filename without .md) -> file path.
    """
    index = {}

    # Index claim files across all claim directories
    for claim_dir in CLAIM_DIRS:
        d = codex / claim_dir
        if not d.exists():
            continue
        for md in d.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            # Map by stem (exact filename match)
            index[stem.lower()] = rel
            # Map by slugified stem
            index[slugify(stem)] = rel

    # Index entity files
    entity_root = codex / ENTITY_DIR
    if entity_root.exists():
        for md in entity_root.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            index[stem.lower()] = rel
            index[slugify(stem)] = rel

    # Index maps/ directory (MOC-style overview docs)
    maps_root = codex / "maps"
    if maps_root.exists():
        for md in maps_root.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            index[stem.lower()] = rel
            index[slugify(stem)] = rel

    # Index top-level docs that might be link targets
    for special in ["overview.md", "livingip-overview.md"]:
        p = codex / special
        if p.exists():
            index[p.stem.lower()] = str(p.relative_to(codex))

    # Index agents/ beliefs and positions (sometimes linked)
    agents_dir = codex / "agents"
    if agents_dir.exists():
        for md in agents_dir.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            index[stem.lower()] = rel

    return index


def resolve_link(
    link_text: str, index: dict, source_dir: str, codex: Path = CODEX_ROOT
) -> str | None:
    """Try to resolve a wiki-link target.
    Returns the relative file path, or None if the link cannot be resolved.
    """
    text = link_text.strip()

    # Special case: [[_map]] resolves to _map.md in the same domain directory
    if text == "_map":
        parts = source_dir.split("/")
        if len(parts) >= 2:
            candidate = f"{parts[0]}/{parts[1]}/_map.md"
            if (codex / candidate).exists():
                return candidate
        return None

    # Path-style references like [[domains/health/_map]]
    if "/" in text:
        candidate = text.rstrip("/")
        if not candidate.endswith(".md"):
            candidate += ".md"
        if (codex / candidate).exists():
            return candidate
        return None

    # Try exact stem match (lowercased)
    key = text.lower()
    if key in index:
        return index[key]

    # Try slugified version
    slug = slugify(text)
    if slug in index:
        return index[slug]

    # Try common spelling variations (plural vs. singular forms)
    for variant in [
        slug.replace("metadaos", "metadao"),
        slug.replace("ais", "ai"),
    ]:
        if variant in index:
            return index[variant]

    return None


def audit(codex: Path) -> dict:
    """Run the full wiki-link audit."""
    index = build_index(codex)
    dead_links = []  # each entry: {file, link, line}
    link_counts = {}  # file -> {"outbound": N, "targets": [...]}
    all_targets = set()  # files that are linked TO
    all_files = set()  # all claim/foundation files

    # Scan all markdown files in claim directories
    for claim_dir in CLAIM_DIRS:
        d = codex / claim_dir
        if not d.exists():
            continue
        for md in d.rglob("*.md"):
            rel = str(md.relative_to(codex))
            all_files.add(rel)
            source_dir = str(md.parent.relative_to(codex))
            try:
                content = md.read_text(encoding="utf-8")
            except Exception:
                continue
            links_in_file = []
            for i, line in enumerate(content.split("\n"), 1):
                for match in WIKI_LINK_RE.finditer(line):
                    link_text = match.group(1)
                    # Links with | carry display-text aliases - keep only the target part
                    if "|" in link_text:
                        link_text = link_text.split("|")[0].strip()
                    resolved = resolve_link(link_text, index, source_dir, codex)
                    if resolved:
                        all_targets.add(resolved)
                        links_in_file.append(resolved)
                    else:
                        dead_links.append({
                            "file": rel,
                            "link": link_text,
                            "line": i,
                        })
            link_counts[rel] = {
                "outbound": len(links_in_file),
                "targets": links_in_file,
            }

    # Find orphaned claims (no inbound links AND no outbound links)
    files_with_outbound = {f for f, c in link_counts.items() if c["outbound"] > 0}
    orphaned = sorted(
        f
        for f in all_files
        if f not in all_targets
        and f not in files_with_outbound
        and not f.endswith("_map.md")  # MOC files are structural, not orphans
    )

    # Compute inbound link counts
    inbound_counts = {}
    for f, c in link_counts.items():
        for target in c["targets"]:
            inbound_counts[target] = inbound_counts.get(target, 0) + 1

    # Claims with high outbound counts (good connectivity)
    high_connectivity = sorted(
        [(f, c["outbound"]) for f, c in link_counts.items() if c["outbound"] >= 3],
        key=lambda x: -x[1],
    )

    # Summary stats
    total_links = sum(c["outbound"] for c in link_counts.values())
    files_with_links = sum(1 for c in link_counts.values() if c["outbound"] > 0)

    # Domain breakdown of dead links
    dead_by_domain = {}
    for dl in dead_links:
        parts = dl["file"].split("/")
        domain = parts[1] if len(parts) >= 3 else parts[0]
        dead_by_domain[domain] = dead_by_domain.get(domain, 0) + 1

    # Domain breakdown of orphans
    orphan_by_domain = {}
    for o in orphaned:
        parts = o.split("/")
        domain = parts[1] if len(parts) >= 3 else parts[0]
        orphan_by_domain[domain] = orphan_by_domain.get(domain, 0) + 1

    return {
        "summary": {
            "total_files": len(all_files),
            "total_links": total_links,
            "files_with_links": files_with_links,
            "files_without_links": len(all_files) - files_with_links,
            "dead_link_count": len(dead_links),
            "orphan_count": len(orphaned),
            "avg_links_per_file": round(total_links / max(len(all_files), 1), 2),
            "high_connectivity_count": len(high_connectivity),
        },
        "dead_links": dead_links,
        "dead_by_domain": dict(sorted(dead_by_domain.items(), key=lambda x: -x[1])),
        "orphaned": orphaned,
        "orphan_by_domain": dict(sorted(orphan_by_domain.items(), key=lambda x: -x[1])),
        "high_connectivity": [
            {"file": f, "outbound_links": n} for f, n in high_connectivity[:20]
        ],
        "inbound_top20": sorted(
            [{"file": f, "inbound_links": n} for f, n in inbound_counts.items()],
            key=lambda x: -x["inbound_links"],
        )[:20],
    }


if __name__ == "__main__":
    codex = Path(sys.argv[1]) if len(sys.argv) > 1 else CODEX_ROOT
    result = audit(codex)
    json.dump(result, sys.stdout, indent=2)
    print()

    # Print human-readable summary to stderr
    s = result["summary"]
    pct_linked = 100 * s["files_with_links"] // max(s["total_files"], 1)
    print("\n=== Wiki-Link Audit ===", file=sys.stderr)
    print(f"Files scanned: {s['total_files']}", file=sys.stderr)
    print(f"Total links: {s['total_links']}", file=sys.stderr)
    print(f"Files with links: {s['files_with_links']} ({pct_linked}%)", file=sys.stderr)
    print(f"Dead links: {s['dead_link_count']}", file=sys.stderr)
    print(f"Orphaned claims: {s['orphan_count']}", file=sys.stderr)
    print(f"Avg links/file: {s['avg_links_per_file']}", file=sys.stderr)
    print(f"High connectivity (≥3 links): {s['high_connectivity_count']}", file=sys.stderr)
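
# Usage sketch (illustrative only; the filename audit_wiki_links.py is a
# hypothetical name for this script, not one defined by the codex):
#
#   # Audit the default root taken from $CODEX_ROOT, saving the JSON report
#   CODEX_ROOT=/opt/teleo-eval/workspaces/main python3 audit_wiki_links.py > audit.json
#
#   # Audit an explicit codex checkout passed as the first argument
#   python3 audit_wiki_links.py /path/to/codex > audit.json
#
# The JSON report goes to stdout, so it can be redirected or piped cleanly;
# the human-readable summary is written to stderr and does not corrupt the
# JSON stream.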