The regex fallback was writing list entries as ' - "title"' (2-space indent + quotes) while existing frontmatter uses '- title' (0-space indent, no quotes). This caused YAML parse failures during merge. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
972 lines
36 KiB
Python
972 lines
36 KiB
Python
#!/usr/bin/env python3
|
|
"""Orphan Reweave — connect isolated claims via vector similarity + Haiku classification.
|
|
|
|
Finds claims with zero incoming links (orphans), uses Qdrant to find semantically
|
|
similar neighbors, classifies the relationship with Haiku, and writes edges on the
|
|
neighbor's frontmatter pointing TO the orphan.
|
|
|
|
Usage:
|
|
python3 reweave.py --dry-run # Show what would be connected
|
|
python3 reweave.py --max-orphans 50 # Process up to 50 orphans
|
|
python3 reweave.py --threshold 0.72 # Override similarity floor
|
|
|
|
Design:
|
|
- Orphan = zero incoming links (no other claim's supports/challenges/related/depends_on points to it)
|
|
- Write edge on NEIGHBOR (not orphan) so orphan gains an incoming link
|
|
- Haiku classifies: supports | challenges | related (>=0.85 confidence for supports/challenges)
|
|
- reweave_edges parallel field for tooling-readable provenance
|
|
- Single PR per run for Leo review
|
|
|
|
Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>
|
|
"""
|
|
|
|
import argparse
|
|
import datetime
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
# Root logger for the whole script; INFO is the normal operating verbosity.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("reweave")

# --- Config ---
# All paths/endpoints can be overridden via environment variables for testing.
REPO_DIR = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main"))
SECRETS_DIR = Path(os.environ.get("SECRETS_DIR", "/opt/teleo-eval/secrets"))
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "teleo-claims")
FORGEJO_URL = os.environ.get("FORGEJO_URL", "http://localhost:3000")

# Top-level KB directories scanned for knowledge files.
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
# Frontmatter fields that carry outgoing graph edges.
EDGE_FIELDS = ("supports", "challenges", "challenged_by", "depends_on", "related")
# Matches [[wiki-style]] links in claim bodies.
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")

# Thresholds (from calibration data — Mar 28)
DEFAULT_THRESHOLD = 0.70  # Elbow in score distribution
DEFAULT_MAX_ORPHANS = 50  # Keep PRs reviewable
DEFAULT_MAX_NEIGHBORS = 3  # Don't over-connect
HAIKU_CONFIDENCE_FLOOR = 0.85  # Below this → default to "related"
PER_FILE_EDGE_CAP = 10  # Max total reweave edges per neighbor file

# Domain processing order: diversity first, internet-finance last (Leo)
DOMAIN_PRIORITY = [
    "ai-alignment", "health", "space-development", "entertainment",
    "creative-industries", "collective-intelligence", "governance",
    # internet-finance last — batch-imported futarchy cluster, lower cross-domain value
    "internet-finance",
]
|
|
|
|
|
|
# ─── Orphan Detection ────────────────────────────────────────────────────────
|
|
|
|
|
|
def _parse_frontmatter(path: Path) -> dict | None:
    """Parse YAML frontmatter from a markdown file. Returns dict or None."""
    try:
        raw = path.read_text(errors="replace")
    except Exception:
        return None
    # Frontmatter must open the file with "---" and close with a "---" line.
    if not raw.startswith("---"):
        return None
    close = raw.find("\n---", 3)
    if close == -1:
        return None
    try:
        parsed = yaml.safe_load(raw[3:close])
    except Exception:
        return None
    # A valid frontmatter is a mapping; scalars/lists are treated as absent.
    return parsed if isinstance(parsed, dict) else None
|
|
|
|
|
|
def _get_body(path: Path) -> str:
    """Get body text (after frontmatter) from a markdown file."""
    try:
        content = path.read_text(errors="replace")
    except Exception:
        return ""
    # No opening "---" means no frontmatter — the whole file is body.
    if not content.startswith("---"):
        return content
    closing = content.find("\n---", 3)
    # Unterminated frontmatter: fall back to returning the whole file.
    return content if closing == -1 else content[closing + 4:].strip()
|
|
|
|
|
|
def _get_edge_targets(path: Path) -> list[str]:
    """Extract all outgoing edge targets from a claim's frontmatter + wiki links."""
    found: list[str] = []
    fm = _parse_frontmatter(path)
    if fm:
        # Canonical edge fields may hold a list or a single scalar string.
        for field in EDGE_FIELDS:
            value = fm.get(field)
            if isinstance(value, list):
                for item in value:
                    if item:
                        found.append(str(item).strip().lower())
            elif isinstance(value, str) and value.strip():
                found.append(value.strip().lower())
        # reweave_edges (written by previous runs) is list-only.
        previous = fm.get("reweave_edges")
        if isinstance(previous, list):
            for item in previous:
                if item:
                    found.append(str(item).strip().lower())

    # Wiki links in the body also count as outgoing edges.
    try:
        raw = path.read_text(errors="replace")
    except Exception:
        pass
    else:
        fm_end = raw.find("\n---", 3)
        if fm_end > 0:
            for link in WIKI_LINK_RE.findall(raw[fm_end + 4:]):
                found.append(link.strip().lower())

    return found
|
|
|
|
|
|
def _claim_name_variants(path: Path, repo_root: Path = None) -> list[str]:
    """Generate name variants for a claim file (used for incoming link matching).

    A claim at domains/ai-alignment/rlhf-reward-hacking.md could be referenced as:
    - "rlhf-reward-hacking"
    - "rlhf reward hacking"
    - "RLHF reward hacking" (title case)
    - The actual 'name' or 'title' from frontmatter
    - "domains/ai-alignment/rlhf-reward-hacking" (relative path without .md)
    """
    stem = path.stem
    names = {stem.lower(), stem.lower().replace("-", " ")}

    # Also match by relative path (Ganymede Q1: some edges use path references)
    if repo_root:
        try:
            rel = str(path.relative_to(repo_root)).removesuffix(".md")
        except ValueError:
            pass
        else:
            names.add(rel.lower())

    # Frontmatter-declared display names.
    fm = _parse_frontmatter(path)
    if fm:
        for key in ("name", "title"):
            candidate = fm.get(key)
            if isinstance(candidate, str) and candidate.strip():
                names.add(candidate.strip().lower())

    return list(names)
|
|
|
|
|
|
def _is_entity(path: Path) -> bool:
    """Check if a file is an entity (not a claim). Entities need different edge vocabulary."""
    fm = _parse_frontmatter(path)
    if fm and fm.get("type") == "entity":
        return True
    # Path-segment check (not a substring test) avoids false positives on
    # paths like "domains/entities-overview/". `path` is already a Path —
    # the previous Path(path) re-wrap was redundant.
    return "entities" in path.parts
|
|
|
|
|
|
def _same_source(path_a: Path, path_b: Path) -> bool:
    """Check if two claims derive from the same source material.

    Prevents self-referential edges where N claims about the same paper
    all "support" each other — inflates graph density without adding information.
    """
    fm_a = _parse_frontmatter(path_a)
    fm_b = _parse_frontmatter(path_b)
    if not (fm_a and fm_b):
        return False

    # Either "source" or the legacy "source_file" field may name the origin.
    origin_a = fm_a.get("source") or fm_a.get("source_file") or ""
    origin_b = fm_b.get("source") or fm_b.get("source_file") or ""
    return bool(
        origin_a and origin_b
        and str(origin_a).strip() == str(origin_b).strip()
    )
|
|
|
|
|
|
def find_all_claims(repo_root: Path) -> list[Path]:
    """Find all knowledge files (claim, framework, entity, decision) in the KB."""
    results: list[Path] = []
    for dirname in EMBED_DIRS:
        root = repo_root / dirname
        if not root.is_dir():
            continue
        for md_file in root.rglob("*.md"):
            # Skip underscore-prefixed files.
            if md_file.name.startswith("_"):
                continue
            fm = _parse_frontmatter(md_file)
            # Keep only typed knowledge files; raw sources, musings, and
            # untyped files are excluded.
            if fm and fm.get("type") not in ("source", "musing", None):
                results.append(md_file)
    return results
|
|
|
|
|
|
def build_reverse_link_index(claims: list[Path]) -> dict[str, set[Path]]:
    """Build a reverse index: claim_name_variant → set of files that link TO it.

    For each claim, extract all outgoing edges. For each target name, record
    the source claim as an incoming link for that target.
    """
    # name_variant → set of source paths that point to it
    incoming: dict[str, set[Path]] = {}

    for claim_path in claims:
        for target in _get_edge_targets(claim_path):
            # setdefault replaces the manual "if target not in incoming" dance.
            incoming.setdefault(target, set()).add(claim_path)

    return incoming
|
|
|
|
|
|
def find_orphans(claims: list[Path], incoming: dict[str, set[Path]],
                 repo_root: Path = None) -> list[Path]:
    """Find claims with zero incoming links."""
    isolated = []
    for claim_path in claims:
        # A claim counts as linked only if some *other* file targets one of
        # its name variants — self-links are discounted.
        for variant in _claim_name_variants(claim_path, repo_root):
            if incoming.get(variant, set()) - {claim_path}:
                break
        else:
            isolated.append(claim_path)
    return isolated
|
|
|
|
|
|
def sort_orphans_by_domain(orphans: list[Path], repo_root: Path) -> list[Path]:
    """Sort orphans by domain priority (diversity first, internet-finance last).

    Unknown domains sort after all known domains but strictly before the
    deliberately-last entry of DOMAIN_PRIORITY (previously they *tied* with
    it, contradicting the stated ordering).
    """
    last = len(DOMAIN_PRIORITY) - 1  # index of the deliberately-last domain

    def domain_key(path: Path) -> tuple[int, str]:
        rel = path.relative_to(repo_root)
        parts = rel.parts
        domain = ""
        if len(parts) >= 2 and parts[0] in ("domains", "entities", "decisions"):
            domain = parts[1]
        elif parts[0] == "foundations" and len(parts) >= 2:
            domain = parts[1]
        elif parts[0] == "core":
            domain = "core"

        try:
            priority = DOMAIN_PRIORITY.index(domain)
            if priority == last:
                # Bump the deprioritized final domain past the unknown slot
                # so unknown domains genuinely sort before it.
                priority = last + 1
        except ValueError:
            # Unknown domain goes before internet-finance but after known ones
            priority = last

        # Tie-break within a domain by file stem for deterministic batches.
        return (priority, path.stem)

    return sorted(orphans, key=domain_key)
|
|
|
|
|
|
# ─── Qdrant Search ───────────────────────────────────────────────────────────
|
|
|
|
|
|
def _get_api_key() -> str:
    """Load OpenRouter API key."""
    # Secrets file takes precedence over the environment variable.
    key_file = SECRETS_DIR / "openrouter-key"
    if key_file.exists():
        return key_file.read_text().strip()
    env_key = os.environ.get("OPENROUTER_API_KEY", "")
    if not env_key:
        # No key anywhere — the run cannot proceed.
        logger.error("No OpenRouter API key found")
        sys.exit(1)
    return env_key
|
|
|
|
|
|
def make_point_id(rel_path: str) -> str:
    """Deterministic point ID from repo-relative path (matches embed-claims.py)."""
    digest = hashlib.md5(rel_path.encode())
    return digest.hexdigest()
|
|
|
|
|
|
def get_vector_from_qdrant(rel_path: str) -> list[float] | None:
    """Retrieve a claim's existing vector from Qdrant by its point ID."""
    payload = json.dumps(
        {"ids": [make_point_id(rel_path)], "with_vector": True}
    ).encode()
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            points = json.loads(resp.read()).get("result", [])
        if points and points[0].get("vector"):
            return points[0]["vector"]
    except Exception as e:
        logger.warning("Qdrant point lookup failed for %s: %s", rel_path, e)
    # Not embedded yet, empty vector, or request failure.
    return None
|
|
|
|
|
|
def search_neighbors(vector: list[float], exclude_path: str,
                     threshold: float, limit: int) -> list[dict]:
    """Search Qdrant for nearest neighbors above threshold, excluding self."""
    query = {
        "vector": vector,
        "limit": limit + 5,  # over-fetch to account for self + filtered
        "with_payload": True,
        "score_threshold": threshold,
        "filter": {
            "must_not": [{"key": "claim_path", "match": {"value": exclude_path}}]
        },
    }
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search",
        data=json.dumps(query).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            response = json.loads(resp.read())
    except Exception as e:
        logger.warning("Qdrant search failed: %s", e)
        return []
    # Trim the over-fetch back down to the requested count.
    return response.get("result", [])[:limit]
|
|
|
|
|
|
# ─── Haiku Edge Classification ───────────────────────────────────────────────
|
|
|
|
|
|
CLASSIFY_PROMPT = """You are classifying the relationship between two knowledge claims.
|
|
|
|
CLAIM A (the orphan — needs to be connected):
|
|
Title: {orphan_title}
|
|
Body: {orphan_body}
|
|
|
|
CLAIM B (the neighbor — already connected in the knowledge graph):
|
|
Title: {neighbor_title}
|
|
Body: {neighbor_body}
|
|
|
|
What is the relationship FROM Claim B TO Claim A?
|
|
|
|
Options:
|
|
- "supports" — Claim B provides evidence, reasoning, or examples that strengthen Claim A
|
|
- "challenges" — Claim B contradicts, undermines, or provides counter-evidence to Claim A. NOTE: "challenges" is underused — if one claim says X works and another says X fails, or they propose incompatible mechanisms, that IS a challenge. Use it.
|
|
- "related" — Claims are topically connected but neither supports nor challenges the other. This is the WEAKEST edge — prefer supports/challenges when the relationship has directionality.
|
|
|
|
Respond with EXACTLY this JSON format, nothing else:
|
|
{{"edge_type": "supports|challenges|related", "confidence": 0.0-1.0, "reason": "one sentence explanation"}}
|
|
"""
|
|
|
|
|
|
def classify_edge(orphan_title: str, orphan_body: str,
                  neighbor_title: str, neighbor_body: str,
                  api_key: str) -> dict:
    """Use Haiku to classify the edge type between two claims.

    Returns {"edge_type": str, "confidence": float, "reason": str}.
    Falls back to "related" on any failure. Unexpected model labels
    (wrong case, values outside the vocabulary) are also normalized to
    "related" so an invalid edge field is never written to frontmatter.
    """
    default = {"edge_type": "related", "confidence": 0.5, "reason": "classification failed"}

    prompt = CLASSIFY_PROMPT.format(
        orphan_title=orphan_title,
        orphan_body=orphan_body[:500],  # truncate bodies to bound token cost
        neighbor_title=neighbor_title,
        neighbor_body=neighbor_body[:500],
    )

    payload = json.dumps({
        "model": "anthropic/claude-3.5-haiku",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200,
        "temperature": 0.3,
    }).encode()

    req = urllib.request.Request(
        "https://openrouter.ai/api/v1/chat/completions",
        data=payload,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read())
        content = data["choices"][0]["message"]["content"].strip()

        # Parse JSON from response (handle markdown code blocks)
        if content.startswith("```"):
            content = content.split("\n", 1)[-1].rsplit("```", 1)[0].strip()

        result = json.loads(content)

        # Fix: validate the label. The previous code trusted whatever string
        # the model returned; anything outside the vocabulary degrades to
        # "related" rather than becoming a bogus frontmatter field.
        edge_type = str(result.get("edge_type", "related")).strip().lower()
        if edge_type not in ("supports", "challenges", "related"):
            edge_type = "related"
        confidence = float(result.get("confidence", 0.5))

        # Enforce confidence floor for supports/challenges
        if edge_type in ("supports", "challenges") and confidence < HAIKU_CONFIDENCE_FLOOR:
            edge_type = "related"

        return {
            "edge_type": edge_type,
            "confidence": confidence,
            # Coerce to str — callers slice this (reason[:60]).
            "reason": str(result.get("reason", "")),
        }
    except Exception as e:
        logger.warning("Haiku classification failed: %s", e)
        return default
|
|
|
|
|
|
# ─── YAML Frontmatter Editing ────────────────────────────────────────────────
|
|
|
|
|
|
def _count_reweave_edges(path: Path) -> int:
    """Count existing reweave_edges in a file's frontmatter."""
    fm = _parse_frontmatter(path)
    rw = fm.get("reweave_edges") if fm else None
    # Anything other than a list (missing, scalar, malformed) counts as zero.
    return len(rw) if isinstance(rw, list) else 0
|
|
|
|
|
|
def write_edge(neighbor_path: Path, orphan_title: str, edge_type: str,
               date_str: str, dry_run: bool = False) -> bool:
    """Write a reweave edge on the neighbor's frontmatter.

    Adds to both the edge_type list (related/supports/challenges) and
    the parallel reweave_edges list for provenance tracking.

    Uses ruamel.yaml for round-trip YAML preservation; falls back to a
    regex editor only when ruamel.yaml is not installed. Returns True
    only when an edge was written (or would be, under dry_run).
    """
    # Check per-file cap
    if _count_reweave_edges(neighbor_path) >= PER_FILE_EDGE_CAP:
        logger.info(" Skip %s — per-file edge cap (%d) reached", neighbor_path.name, PER_FILE_EDGE_CAP)
        return False

    try:
        text = neighbor_path.read_text(errors="replace")
    except Exception as e:
        logger.warning(" Cannot read %s: %s", neighbor_path, e)
        return False

    if not text.startswith("---"):
        logger.warning(" No frontmatter in %s", neighbor_path.name)
        return False

    end = text.find("\n---", 3)
    if end == -1:
        return False

    fm_text = text[3:end]
    body_text = text[end:]  # includes the closing ---

    # Fix: import separately so only a *missing* ruamel triggers the regex
    # fallback. Previously the whole edit sat in one try/except ImportError,
    # so any YAML parse/dump error on a single malformed file crashed the
    # entire run uncaught.
    try:
        from ruamel.yaml import YAML
    except ImportError:
        logger.info(" ruamel.yaml not available, using regex fallback")
        return _write_edge_regex(neighbor_path, fm_text, body_text, orphan_title,
                                 edge_type, date_str, dry_run)

    import io

    try:
        ry = YAML()
        ry.preserve_quotes = True
        ry.width = 4096  # prevent line wrapping

        fm = ry.load(fm_text)
        if not isinstance(fm, dict):
            return False

        # Add to edge_type list (related/supports/challenges)
        # Clean value only — provenance tracked in reweave_edges (Ganymede: comment-in-string bug)
        if edge_type not in fm:
            fm[edge_type] = []
        elif not isinstance(fm[edge_type], list):
            fm[edge_type] = [fm[edge_type]]

        # Check for duplicate
        existing = [str(v).strip().lower() for v in fm[edge_type] if v]
        if orphan_title.strip().lower() in existing:
            logger.info(" Skip duplicate edge: %s → %s", neighbor_path.name, orphan_title)
            return False

        fm[edge_type].append(orphan_title)

        # Add to reweave_edges with provenance (edge_type + date for audit trail)
        if "reweave_edges" not in fm:
            fm["reweave_edges"] = []
        elif not isinstance(fm["reweave_edges"], list):
            fm["reweave_edges"] = [fm["reweave_edges"]]
        fm["reweave_edges"].append(f"{orphan_title}|{edge_type}|{date_str}")

        # Serialize back
        buf = io.StringIO()
        ry.dump(fm, buf)
        new_fm = buf.getvalue().rstrip("\n")

        new_text = f"---\n{new_fm}{body_text}"

        if not dry_run:
            neighbor_path.write_text(new_text)
        return True

    except Exception as e:
        # A malformed file should skip that file, not abort the batch.
        logger.warning(" YAML round-trip edit failed for %s: %s", neighbor_path.name, e)
        return False
|
|
|
|
|
|
def _write_edge_regex(neighbor_path: Path, fm_text: str, body_text: str,
                      orphan_title: str, edge_type: str, date_str: str,
                      dry_run: bool) -> bool:
    """Fallback: add edge via regex when ruamel.yaml is unavailable.

    Appends `- {orphan_title}` to the edge_type list and a provenance entry
    to reweave_edges. Returns False for duplicates or unsupported inline
    (`field: [...]`) list syntax on the edge field.
    """
    # Strip leading newline from fm_text (text[3:end] includes \n after ---)
    fm_text = fm_text.lstrip("\n")

    # Check for duplicate before writing
    existing_re = re.compile(
        rf'^\s*-\s*["\']?{re.escape(orphan_title)}["\']?\s*$',
        re.MULTILINE | re.IGNORECASE,
    )
    if existing_re.search(fm_text):
        logger.info(" Skip duplicate edge (regex): %s → %s", neighbor_path.name, orphan_title)
        return False

    # Fix: a list item at ANY indent continues the list. The previous check
    # line.startswith(("- ", " -")) missed entries indented by two or more
    # spaces, so the new entry was inserted mid-list, breaking the YAML.
    item_re = re.compile(r"\s*-\s")

    def _append_to_list(fm: str, field: str, entry: str) -> str | None:
        """Append `entry` to block-list `field`; None if the field is an inline list."""
        if re.search(rf"^{field}:\s*\[", fm, re.MULTILINE):
            return None  # inline list — too fragile for regex editing
        if not re.search(rf"^{field}:\s*$", fm, re.MULTILINE):
            # Field doesn't exist — add at end of frontmatter
            return fm.rstrip("\n") + f"\n{field}:\n{entry}"
        out_lines: list[str] = []
        in_field = False
        inserted = False
        for line in fm.split("\n"):
            if in_field and not item_re.match(line):
                # First non-item line ends the list — insert just before it.
                out_lines.append(entry)
                in_field = False
                inserted = True
            out_lines.append(line)
            if re.match(rf"^{field}:\s*$", line):
                in_field = True
        if in_field and not inserted:
            out_lines.append(entry)  # field was last in frontmatter
        return "\n".join(out_lines)

    entry_line = f'- {orphan_title}'
    rw_line = f'- {orphan_title}|{edge_type}|{date_str}'

    updated = _append_to_list(fm_text, edge_type, entry_line)
    if updated is None:
        logger.warning(" Inline list format for %s in %s, skipping", edge_type, neighbor_path.name)
        return False
    fm_text = updated

    updated = _append_to_list(fm_text, "reweave_edges", rw_line)
    if updated is None:
        # Edge was added but provenance can't be: previously this case lost
        # the reweave_edges entry silently — at least log it now.
        logger.warning(" Inline reweave_edges in %s — provenance entry not recorded", neighbor_path.name)
    else:
        fm_text = updated

    new_text = f"---\n{fm_text}{body_text}"

    if not dry_run:
        neighbor_path.write_text(new_text)
    return True
|
|
|
|
|
|
# ─── Git + PR ────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def create_branch(repo_root: Path, branch_name: str) -> bool:
    """Create and checkout a new branch. Cleans up stale local/remote branches from prior failed runs."""
    cwd = str(repo_root)

    # A stale local branch from an earlier failed run today — delete quietly,
    # ignoring the error when it doesn't exist.
    subprocess.run(["git", "branch", "-D", branch_name],
                   cwd=cwd, capture_output=True)

    # Same for a stale remote branch (needs push credentials).
    token_file = SECRETS_DIR / "forgejo-admin-token"
    if token_file.exists():
        token = token_file.read_text().strip()
        push_url = f"http://teleo:{token}@localhost:3000/teleo/teleo-codex.git"
        subprocess.run(["git", "push", push_url, "--delete", branch_name],
                       cwd=cwd, capture_output=True)

    try:
        subprocess.run(["git", "checkout", "-b", branch_name],
                       cwd=cwd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        logger.error("Failed to create branch %s: %s", branch_name, e.stderr.decode())
        return False
    return True
|
|
|
|
|
|
def commit_and_push(repo_root: Path, branch_name: str, modified_files: list[Path],
                    orphan_count: int) -> bool:
    """Stage modified files, commit, and push.

    Returns True on success. Fix: git failures (add/commit/push with
    check=True) are now caught, logged, and reported as False instead of
    raising CalledProcessError and crashing the run mid-batch.
    """
    cwd = str(repo_root)
    try:
        # Stage only modified files
        for f in modified_files:
            subprocess.run(["git", "add", str(f)], cwd=cwd,
                           check=True, capture_output=True)

        # Check if anything staged
        result = subprocess.run(["git", "diff", "--cached", "--name-only"],
                                cwd=cwd, capture_output=True, text=True)
        if not result.stdout.strip():
            logger.info("No files staged — nothing to commit")
            return False

        msg = (
            f"reweave: connect {orphan_count} orphan claims via vector similarity\n\n"
            f"Threshold: {DEFAULT_THRESHOLD}, Haiku classification, {len(modified_files)} files modified.\n\n"
            f"Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>"
        )
        subprocess.run(["git", "commit", "-m", msg], cwd=cwd,
                       check=True, capture_output=True)

        # Push — inject token
        token_file = SECRETS_DIR / "forgejo-admin-token"
        if not token_file.exists():
            logger.error("No Forgejo token found at %s", token_file)
            return False
        token = token_file.read_text().strip()
        push_url = f"http://teleo:{token}@localhost:3000/teleo/teleo-codex.git"

        subprocess.run(["git", "push", "-u", push_url, branch_name],
                       cwd=cwd, check=True, capture_output=True)
        return True
    except subprocess.CalledProcessError as e:
        # Log stderr only — e.cmd may contain the token-bearing push URL.
        stderr = e.stderr.decode(errors="replace") if e.stderr else ""
        logger.error("Git operation failed (exit %s): %s", e.returncode, stderr)
        return False
|
|
|
|
|
|
def create_pr(branch_name: str, orphan_count: int, summary_lines: list[str]) -> str | None:
    """Create a Forgejo PR for the reweave batch."""
    token_file = SECRETS_DIR / "forgejo-admin-token"
    if not token_file.exists():
        return None
    token = token_file.read_text().strip()

    # Cap the edge list at 30 entries to keep the PR body readable.
    summary = "\n".join(f"- {line}" for line in summary_lines[:30])
    body = (
        f"## Orphan Reweave\n\n"
        f"Connected **{orphan_count}** orphan claims to the knowledge graph "
        f"via vector similarity (threshold {DEFAULT_THRESHOLD}) + Haiku edge classification.\n\n"
        f"### Edges Added\n{summary}\n\n"
        f"### Review Guide\n"
        f"- Each edge has a `# reweave:YYYY-MM-DD` comment — strip after review\n"
        f"- `reweave_edges` field tracks automated edges for tooling (graph_expand weights them 0.75x)\n"
        f"- Upgrade `related` → `supports`/`challenges` where you have better judgment\n"
        f"- Delete any edges that don't make sense\n\n"
        f"Pentagon-Agent: Epimetheus"
    )

    pr_fields = {
        "title": f"reweave: connect {orphan_count} orphan claims",
        "body": body,
        "head": branch_name,
        "base": "main",
    }
    request = urllib.request.Request(
        f"{FORGEJO_URL}/api/v1/repos/teleo/teleo-codex/pulls",
        data=json.dumps(pr_fields).encode(),
        headers={
            "Authorization": f"token {token}",
            "Content-Type": "application/json",
        },
    )

    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            response = json.loads(resp.read())
    except Exception as e:
        logger.error("PR creation failed: %s", e)
        return None
    return response.get("html_url", "")
|
|
|
|
|
|
# ─── Worktree Lock ───────────────────────────────────────────────────────────
|
|
|
|
# Holds the open lock-file handle between acquire_lock() and release_lock().
_lock_fd = None  # Module-level to prevent GC and avoid function-attribute fragility
|
|
|
|
|
|
def acquire_lock(lock_path: Path, timeout: int = 30) -> bool:
    """Acquire file lock for worktree access. Returns True if acquired.

    Non-blocking: a held lock fails immediately (the `timeout` parameter is
    currently unused; kept for interface compatibility). The open handle is
    kept in module-level _lock_fd until release_lock().
    """
    global _lock_fd
    import fcntl
    handle = None
    try:
        lock_path.parent.mkdir(parents=True, exist_ok=True)
        handle = open(lock_path, "w")
        fcntl.flock(handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
        handle.write(f"reweave:{os.getpid()}\n")
        handle.flush()
        _lock_fd = handle
        return True
    except (IOError, OSError):
        # Fix: close the half-opened handle — previously only _lock_fd was
        # nulled, leaking a file descriptor on every failed attempt.
        if handle is not None:
            try:
                handle.close()
            except Exception:
                pass
        logger.warning("Could not acquire worktree lock at %s — another process has it", lock_path)
        _lock_fd = None
        return False
|
|
|
|
|
|
def release_lock(lock_path: Path):
    """Release worktree lock."""
    global _lock_fd
    import fcntl
    # Detach the module-level handle first so a re-entrant call is a no-op.
    handle, _lock_fd = _lock_fd, None
    if handle:
        try:
            fcntl.flock(handle, fcntl.LOCK_UN)
            handle.close()
        except Exception:
            pass
    try:
        lock_path.unlink(missing_ok=True)
    except Exception:
        pass
|
|
|
|
|
|
# ─── Main ────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def main():
    """Entry point: find orphans, classify edges, write them, and open a PR.

    Pipeline: scan KB → reverse-link index → orphan list (domain-sorted) →
    per-orphan Qdrant neighbor search + Haiku classification → (unless
    --dry-run) branch, edge writes, commit/push, and a Forgejo PR.
    """
    global REPO_DIR, DEFAULT_THRESHOLD

    parser = argparse.ArgumentParser(description="Orphan Reweave — connect isolated claims")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be connected without modifying files")
    parser.add_argument("--max-orphans", type=int, default=DEFAULT_MAX_ORPHANS,
                        help=f"Max orphans to process (default {DEFAULT_MAX_ORPHANS})")
    parser.add_argument("--max-neighbors", type=int, default=DEFAULT_MAX_NEIGHBORS,
                        help=f"Max neighbors per orphan (default {DEFAULT_MAX_NEIGHBORS})")
    parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD,
                        help=f"Minimum cosine similarity (default {DEFAULT_THRESHOLD})")
    parser.add_argument("--repo-dir", type=str, default=None,
                        help="Override repo directory")
    args = parser.parse_args()

    if args.repo_dir:
        REPO_DIR = Path(args.repo_dir)
    # Propagate the CLI threshold to the global so commit/PR text reports it.
    DEFAULT_THRESHOLD = args.threshold

    date_str = datetime.date.today().isoformat()
    branch_name = f"reweave/{date_str}"

    logger.info("=== Orphan Reweave ===")
    logger.info("Repo: %s", REPO_DIR)
    logger.info("Threshold: %.2f, Max orphans: %d, Max neighbors: %d",
                args.threshold, args.max_orphans, args.max_neighbors)
    if args.dry_run:
        logger.info("DRY RUN — no files will be modified")

    # Step 1: Find all claims and build reverse-link index
    logger.info("Step 1: Scanning KB for claims...")
    claims = find_all_claims(REPO_DIR)
    logger.info(" Found %d knowledge files", len(claims))

    logger.info("Step 2: Building reverse-link index...")
    incoming = build_reverse_link_index(claims)

    logger.info("Step 3: Finding orphans...")
    orphans = find_orphans(claims, incoming, REPO_DIR)
    orphans = sort_orphans_by_domain(orphans, REPO_DIR)
    # max() guards against division by zero on an empty KB.
    logger.info(" Found %d orphans (%.1f%% of %d claims)",
                len(orphans), 100 * len(orphans) / max(len(claims), 1), len(claims))

    if not orphans:
        logger.info("No orphans found — KB is fully connected!")
        return

    # Cap to max_orphans
    batch = orphans[:args.max_orphans]
    logger.info(" Processing batch of %d orphans", len(batch))

    # Step 4: For each orphan, find neighbors and classify edges
    api_key = _get_api_key()
    edges_to_write: list[dict] = []  # {neighbor_path, orphan_title, edge_type, reason, score}
    skipped_no_vector = 0
    skipped_no_neighbors = 0
    skipped_entity_pair = 0
    skipped_same_source = 0

    for i, orphan_path in enumerate(batch):
        rel_path = str(orphan_path.relative_to(REPO_DIR))
        fm = _parse_frontmatter(orphan_path)
        # Prefer frontmatter name/title; fall back to the humanized file stem.
        orphan_title = fm.get("name", fm.get("title", orphan_path.stem.replace("-", " "))) if fm else orphan_path.stem
        orphan_body = _get_body(orphan_path)

        logger.info("[%d/%d] %s", i + 1, len(batch), orphan_title[:80])

        # Get vector from Qdrant
        vector = get_vector_from_qdrant(rel_path)
        if not vector:
            logger.info(" No vector in Qdrant — skipping (not embedded yet)")
            skipped_no_vector += 1
            continue

        # Find neighbors
        hits = search_neighbors(vector, rel_path, args.threshold, args.max_neighbors)
        if not hits:
            logger.info(" No neighbors above threshold %.2f", args.threshold)
            skipped_no_neighbors += 1
            continue

        for hit in hits:
            payload = hit.get("payload", {})
            neighbor_rel = payload.get("claim_path", "")
            neighbor_title = payload.get("claim_title", "")
            score = hit.get("score", 0)

            if not neighbor_rel:
                continue

            neighbor_path = REPO_DIR / neighbor_rel
            # Qdrant can lag behind the worktree — neighbor may be deleted.
            if not neighbor_path.exists():
                logger.info(" Neighbor %s not found on disk — skipping", neighbor_rel)
                continue

            # Entity-to-entity exclusion: entities need different vocabulary
            # (founded_by, competes_with, etc.) not supports/challenges
            if _is_entity(orphan_path) and _is_entity(neighbor_path):
                logger.info(" Skip entity-entity pair: %s ↔ %s", orphan_path.name, neighbor_path.name)
                skipped_entity_pair += 1
                continue

            # Same-source exclusion: N claims from one paper all "supporting" each other
            # inflates graph density without adding information
            if _same_source(orphan_path, neighbor_path):
                logger.info(" Skip same-source pair: %s ↔ %s", orphan_path.name, neighbor_path.name)
                skipped_same_source += 1
                continue

            neighbor_body = _get_body(neighbor_path)

            # Classify with Haiku
            result = classify_edge(orphan_title, orphan_body,
                                   neighbor_title, neighbor_body, api_key)
            edge_type = result["edge_type"]
            confidence = result["confidence"]
            reason = result["reason"]

            logger.info(" → %s (%.3f) %s [%.2f]: %s",
                        neighbor_title[:50], score, edge_type, confidence, reason[:60])

            # Collect now; actual file writes happen after the scan (Step 5),
            # so a dry run can report everything without touching files.
            edges_to_write.append({
                "neighbor_path": neighbor_path,
                "neighbor_rel": neighbor_rel,
                "neighbor_title": neighbor_title,
                "orphan_title": str(orphan_title),
                "orphan_rel": rel_path,
                "edge_type": edge_type,
                "score": score,
                "confidence": confidence,
                "reason": reason,
            })

        # Rate limit courtesy
        if not args.dry_run and i < len(batch) - 1:
            time.sleep(0.3)

    logger.info("\n=== Summary ===")
    logger.info("Orphans processed: %d", len(batch))
    logger.info("Edges to write: %d", len(edges_to_write))
    logger.info("Skipped (no vector): %d", skipped_no_vector)
    logger.info("Skipped (no neighbors): %d", skipped_no_neighbors)
    logger.info("Skipped (entity-entity): %d", skipped_entity_pair)
    logger.info("Skipped (same-source): %d", skipped_same_source)

    if not edges_to_write:
        logger.info("Nothing to write.")
        return

    if args.dry_run:
        logger.info("\n=== Dry Run — Edges That Would Be Written ===")
        for e in edges_to_write:
            logger.info(" %s → [%s] → %s (score=%.3f, conf=%.2f)",
                        e["neighbor_title"][:40], e["edge_type"],
                        e["orphan_title"][:40], e["score"], e["confidence"])
        return

    # Step 5: Acquire lock, create branch, write edges, commit, push, create PR
    lock_path = REPO_DIR.parent / ".main-worktree.lock"
    if not acquire_lock(lock_path):
        logger.error("Cannot acquire worktree lock — aborting")
        sys.exit(1)

    try:
        # Create branch
        if not create_branch(REPO_DIR, branch_name):
            logger.error("Failed to create branch %s", branch_name)
            sys.exit(1)

        # Write edges
        modified_files = set()
        written = 0
        summary_lines = []

        for e in edges_to_write:
            ok = write_edge(
                e["neighbor_path"], e["orphan_title"], e["edge_type"],
                date_str, dry_run=False,
            )
            if ok:
                modified_files.add(e["neighbor_path"])
                written += 1
                summary_lines.append(
                    f"`{e['neighbor_title'][:50]}` → [{e['edge_type']}] → "
                    f"`{e['orphan_title'][:50]}` (score={e['score']:.3f})"
                )

        logger.info("Wrote %d edges across %d files", written, len(modified_files))

        if not modified_files:
            # Every write was skipped (caps/duplicates) — drop the empty branch.
            logger.info("No edges written — cleaning up branch")
            subprocess.run(["git", "checkout", "main"], cwd=str(REPO_DIR),
                           capture_output=True)
            subprocess.run(["git", "branch", "-d", branch_name], cwd=str(REPO_DIR),
                           capture_output=True)
            return

        # Commit and push
        # Count only orphans whose edge actually landed in a modified file.
        orphan_count = len(set(e["orphan_title"] for e in edges_to_write if e["neighbor_path"] in modified_files))
        if commit_and_push(REPO_DIR, branch_name, list(modified_files), orphan_count):
            logger.info("Pushed branch %s", branch_name)

            # Create PR
            pr_url = create_pr(branch_name, orphan_count, summary_lines)
            if pr_url:
                logger.info("PR created: %s", pr_url)
            else:
                logger.warning("PR creation failed — branch is pushed, create manually")
        else:
            logger.error("Commit/push failed")

    finally:
        # Always return to main — even on exception (Ganymede: branch cleanup)
        try:
            subprocess.run(["git", "checkout", "main"], cwd=str(REPO_DIR),
                           capture_output=True)
        except Exception:
            pass
        release_lock(lock_path)

    logger.info("Done.")
|
|
|
|
|
|
# Script entry point — no side effects on import.
if __name__ == "__main__":
    main()
|