#!/usr/bin/env python3
"""Audit wiki-links across the teleo-codex knowledge base.

Crawls domains/, foundations/, core/, decisions/ for [[wiki-links]].
Resolves each link against known claim files, entity files, and _map files.
Reports dead links, orphaned claims, and link counts.

Output: JSON to stdout with dead links, orphans, and per-file link counts.
"""
import json
import os
import re
import sys
import unicodedata
from pathlib import Path

CODEX_ROOT = Path(os.environ.get("CODEX_ROOT", "/opt/teleo-eval/workspaces/main"))
CLAIM_DIRS = ["domains", "foundations", "core", "decisions"]
ENTITY_DIR = "entities"
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")


def slugify(title: str) -> str:
    """Convert a wiki-link title to the kebab-case slug used for filenames."""
    s = title.strip().lower()
    s = unicodedata.normalize("NFKD", s)
    s = re.sub(r"[^\w\s-]", "", s)
    s = re.sub(r"[\s_]+", "-", s)
    s = re.sub(r"-+", "-", s)
    return s.strip("-")


def build_index(codex: Path) -> dict:
    """Build a lookup index of all resolvable targets.

    Returns dict mapping normalized slug -> file path.
    Also maps raw stem (filename without .md) -> file path.
    """
    index = {}

    # Index claim files across all claim directories
    for claim_dir in CLAIM_DIRS:
        d = codex / claim_dir
        if not d.exists():
            continue
        for md in d.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            # Map by stem (exact filename match)
            index[stem.lower()] = rel
            # Map by slugified stem
            index[slugify(stem)] = rel

    # Index entity files
    entity_root = codex / ENTITY_DIR
    if entity_root.exists():
        for md in entity_root.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            index[stem.lower()] = rel
            index[slugify(stem)] = rel

    # Index maps/ directory (MOC-style overview docs)
    maps_root = codex / "maps"
    if maps_root.exists():
        for md in maps_root.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            index[stem.lower()] = rel
            index[slugify(stem)] = rel

    # Index top-level docs that might be link targets
    for special in ["overview.md", "livingip-overview.md"]:
        p = codex / special
        if p.exists():
            index[p.stem.lower()] = str(p.relative_to(codex))

    # Index agents/ beliefs and positions (sometimes linked)
    agents_dir = codex / "agents"
    if agents_dir.exists():
        for md in agents_dir.rglob("*.md"):
            stem = md.stem
            rel = str(md.relative_to(codex))
            index[stem.lower()] = rel

    return index


def resolve_link(
    link_text: str, index: dict, source_dir: str, codex: Path = CODEX_ROOT
) -> str | None:
    """Try to resolve a wiki-link target.
    Returns the relative file path, or None if the link cannot be resolved.
    """
    text = link_text.strip()

    # Special case: [[_map]] resolves to _map.md in the same domain directory
    if text == "_map":
        parts = source_dir.split("/")
        if len(parts) >= 2:
            candidate = f"{parts[0]}/{parts[1]}/_map.md"
            if (codex / candidate).exists():
                return candidate
        return None

    # Path-style references like [[domains/health/_map]]
    if "/" in text:
        candidate = text.rstrip("/")
        if not candidate.endswith(".md"):
            candidate += ".md"
        if (codex / candidate).exists():
            return candidate
        return None

    # Try exact stem match (lowercased)
    key = text.lower()
    if key in index:
        return index[key]

    # Try slugified version
    slug = slugify(text)
    if slug in index:
        return index[slug]

    # Try common spelling variations (plural vs. singular forms)
    for variant in [
        slug.replace("metadaos", "metadao"),
        slug.replace("ais", "ai"),
    ]:
        if variant in index:
            return index[variant]

    return None


def audit(codex: Path) -> dict:
    """Run the full wiki-link audit."""
    index = build_index(codex)
    dead_links = []  # each entry: {file, link, line}
    link_counts = {}  # file -> {"outbound": N, "targets": [...]}
    all_targets = set()  # files that are linked TO
    all_files = set()  # all claim/foundation files

    # Scan all markdown files in claim directories
    for claim_dir in CLAIM_DIRS:
        d = codex / claim_dir
        if not d.exists():
            continue
        for md in d.rglob("*.md"):
            rel = str(md.relative_to(codex))
            all_files.add(rel)
            source_dir = str(md.parent.relative_to(codex))
            try:
                content = md.read_text(encoding="utf-8")
            except Exception:
                continue
            links_in_file = []
            for i, line in enumerate(content.split("\n"), 1):
                for match in WIKI_LINK_RE.finditer(line):
                    link_text = match.group(1)
                    # Links with | carry display-text aliases - keep only the target part
                    if "|" in link_text:
                        link_text = link_text.split("|")[0].strip()
                    resolved = resolve_link(link_text, index, source_dir, codex)
                    if resolved:
                        all_targets.add(resolved)
                        links_in_file.append(resolved)
                    else:
                        dead_links.append({
                            "file": rel,
                            "link": link_text,
                            "line": i,
                        })
            link_counts[rel] = {
                "outbound": len(links_in_file),
                "targets": links_in_file,
            }

    # Find orphaned claims (no inbound links AND no outbound links)
    files_with_outbound = {f for f, c in link_counts.items() if c["outbound"] > 0}
    orphaned = sorted(
        f
        for f in all_files
        if f not in all_targets
        and f not in files_with_outbound
        and not f.endswith("_map.md")  # MOC files are structural, not orphans
    )

    # Compute inbound link counts
    inbound_counts = {}
    for f, c in link_counts.items():
        for target in c["targets"]:
            inbound_counts[target] = inbound_counts.get(target, 0) + 1

    # Claims with high outbound counts (good connectivity)
    high_connectivity = sorted(
        [(f, c["outbound"]) for f, c in link_counts.items() if c["outbound"] >= 3],
        key=lambda x: -x[1],
    )

    # Summary stats
    total_links = sum(c["outbound"] for c in link_counts.values())
    files_with_links = sum(1 for c in link_counts.values() if c["outbound"] > 0)

    # Domain breakdown of dead links
    dead_by_domain = {}
    for dl in dead_links:
        parts = dl["file"].split("/")
        domain = parts[1] if len(parts) >= 3 else parts[0]
        dead_by_domain[domain] = dead_by_domain.get(domain, 0) + 1

    # Domain breakdown of orphans
    orphan_by_domain = {}
    for o in orphaned:
        parts = o.split("/")
        domain = parts[1] if len(parts) >= 3 else parts[0]
        orphan_by_domain[domain] = orphan_by_domain.get(domain, 0) + 1

    return {
        "summary": {
            "total_files": len(all_files),
            "total_links": total_links,
            "files_with_links": files_with_links,
            "files_without_links": len(all_files) - files_with_links,
            "dead_link_count": len(dead_links),
            "orphan_count": len(orphaned),
            "avg_links_per_file": round(total_links / max(len(all_files), 1), 2),
            "high_connectivity_count": len(high_connectivity),
        },
        "dead_links": dead_links,
        "dead_by_domain": dict(sorted(dead_by_domain.items(), key=lambda x: -x[1])),
        "orphaned": orphaned,
        "orphan_by_domain": dict(sorted(orphan_by_domain.items(), key=lambda x: -x[1])),
        "high_connectivity": [
            {"file": f, "outbound_links": n} for f, n in high_connectivity[:20]
        ],
        "inbound_top20": sorted(
            [{"file": f, "inbound_links": n} for f, n in inbound_counts.items()],
            key=lambda x: -x["inbound_links"],
        )[:20],
    }


if __name__ == "__main__":
    codex = Path(sys.argv[1]) if len(sys.argv) > 1 else CODEX_ROOT
    result = audit(codex)
    json.dump(result, sys.stdout, indent=2)
    print()

    # Print human-readable summary to stderr
    s = result["summary"]
    pct_linked = 100 * s["files_with_links"] // max(s["total_files"], 1)
    print("\n=== Wiki-Link Audit ===", file=sys.stderr)
    print(f"Files scanned: {s['total_files']}", file=sys.stderr)
    print(f"Total links: {s['total_links']}", file=sys.stderr)
    print(f"Files with links: {s['files_with_links']} ({pct_linked}%)", file=sys.stderr)
    print(f"Dead links: {s['dead_link_count']}", file=sys.stderr)
    print(f"Orphaned claims: {s['orphan_count']}", file=sys.stderr)
    print(f"Avg links/file: {s['avg_links_per_file']}", file=sys.stderr)
    print(f"High connectivity (≥3 links): {s['high_connectivity_count']}", file=sys.stderr)
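
# Usage sketch (illustrative only; the filename audit_wiki_links.py is a
# hypothetical name for this script, not one defined by the codex):
#
#   # Audit the default root taken from $CODEX_ROOT, saving the JSON report
#   CODEX_ROOT=/opt/teleo-eval/workspaces/main python3 audit_wiki_links.py > audit.json
#
#   # Audit an explicit codex checkout passed as the first argument
#   python3 audit_wiki_links.py /path/to/codex > audit.json
#
# The JSON report goes to stdout, so it can be redirected or piped cleanly;
# the human-readable summary is written to stderr and does not corrupt the
# JSON stream.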