#!/usr/bin/env python3 """Orphan Reweave — connect isolated claims via vector similarity + Haiku classification. Finds claims with zero incoming links (orphans), uses Qdrant to find semantically similar neighbors, classifies the relationship with Haiku, and writes edges on the neighbor's frontmatter pointing TO the orphan. Usage: python3 reweave.py --dry-run # Show what would be connected python3 reweave.py --max-orphans 50 # Process up to 50 orphans python3 reweave.py --threshold 0.72 # Override similarity floor Design: - Orphan = zero incoming links (no other claim's supports/challenges/related/depends_on points to it) - Write edge on NEIGHBOR (not orphan) so orphan gains an incoming link - Haiku classifies: supports | challenges | related (>=0.85 confidence for supports/challenges) - reweave_edges parallel field for tooling-readable provenance - Single PR per run for Leo review Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887> """ import argparse import datetime import hashlib import json import logging import os import re import subprocess import sys import time import urllib.request from pathlib import Path import yaml logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logger = logging.getLogger("reweave") # --- Config --- REPO_DIR = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main")) SECRETS_DIR = Path(os.environ.get("SECRETS_DIR", "/opt/teleo-eval/secrets")) QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333") QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "teleo-claims") FORGEJO_URL = os.environ.get("FORGEJO_URL", "http://localhost:3000") EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"] EDGE_FIELDS = ("supports", "challenges", "challenged_by", "depends_on", "related") WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]") # Thresholds (from calibration data — Mar 28) DEFAULT_THRESHOLD = 0.70 # Elbow in score distribution DEFAULT_MAX_ORPHANS = 50 # Keep PRs reviewable DEFAULT_MAX_NEIGHBORS = 3 # Don't over-connect HAIKU_CONFIDENCE_FLOOR = 0.85 # Below this → default to "related" PER_FILE_EDGE_CAP = 10 # Max total reweave edges per neighbor file # Domain processing order: diversity first, internet-finance last (Leo) DOMAIN_PRIORITY = [ "ai-alignment", "health", "space-development", "entertainment", "creative-industries", "collective-intelligence", "governance", # internet-finance last — batch-imported futarchy cluster, lower cross-domain value "internet-finance", ] # ─── Orphan Detection ──────────────────────────────────────────────────────── def _parse_frontmatter(path: Path) -> dict | None: """Parse YAML frontmatter from a markdown file. Returns dict or None.""" try: text = path.read_text(errors="replace") except Exception: return None if not text.startswith("---"): return None end = text.find("\n---", 3) if end == -1: return None try: fm = yaml.safe_load(text[3:end]) return fm if isinstance(fm, dict) else None except Exception: return None def _get_body(path: Path) -> str: """Get body text (after frontmatter) from a markdown file.""" try: text = path.read_text(errors="replace") except Exception: return "" if not text.startswith("---"): return text end = text.find("\n---", 3) if end == -1: return text return text[end + 4:].strip() def _get_edge_targets(path: Path) -> list[str]: """Extract all outgoing edge targets from a claim's frontmatter + wiki links.""" targets = [] fm = _parse_frontmatter(path) if fm: for field in EDGE_FIELDS: val = fm.get(field) if isinstance(val, list): targets.extend(str(v).strip().lower() for v in val if v) elif isinstance(val, str) and val.strip(): targets.append(val.strip().lower()) # Also check reweave_edges (from previous runs) rw = fm.get("reweave_edges") if isinstance(rw, list): targets.extend(str(v).strip().lower() for v in rw if v) # Wiki links in body try: text = path.read_text(errors="replace") end = text.find("\n---", 3) if end > 0: body = text[end + 4:] for link in WIKI_LINK_RE.findall(body): targets.append(link.strip().lower()) except Exception: pass return targets def _claim_name_variants(path: Path, repo_root: Path = None) -> list[str]: """Generate name variants for a claim file (used for incoming link matching). A claim at domains/ai-alignment/rlhf-reward-hacking.md could be referenced as: - "rlhf-reward-hacking" - "rlhf reward hacking" - "RLHF reward hacking" (title case) - The actual 'name' or 'title' from frontmatter - "domains/ai-alignment/rlhf-reward-hacking" (relative path without .md) """ variants = set() stem = path.stem variants.add(stem.lower()) variants.add(stem.lower().replace("-", " ")) # Also match by relative path (Ganymede Q1: some edges use path references) if repo_root: try: rel = str(path.relative_to(repo_root)).removesuffix(".md") variants.add(rel.lower()) except ValueError: pass fm = _parse_frontmatter(path) if fm: for key in ("name", "title"): val = fm.get(key) if isinstance(val, str) and val.strip(): variants.add(val.strip().lower()) return list(variants) def _is_entity(path: Path) -> bool: """Check if a file is an entity (not a claim). Entities need different edge vocabulary.""" fm = _parse_frontmatter(path) if fm and fm.get("type") == "entity": return True # Check path parts — avoids false positives on paths like "domains/entities-overview/" return "entities" in Path(path).parts def _same_source(path_a: Path, path_b: Path) -> bool: """Check if two claims derive from the same source material. Prevents self-referential edges where N claims about the same paper all "support" each other — inflates graph density without adding information. """ fm_a = _parse_frontmatter(path_a) fm_b = _parse_frontmatter(path_b) if not fm_a or not fm_b: return False # Check source field src_a = fm_a.get("source") or fm_a.get("source_file") or "" src_b = fm_b.get("source") or fm_b.get("source_file") or "" if src_a and src_b and str(src_a).strip() == str(src_b).strip(): return True return False def find_all_claims(repo_root: Path) -> list[Path]: """Find all knowledge files (claim, framework, entity, decision) in the KB.""" claims = [] for d in EMBED_DIRS: base = repo_root / d if not base.is_dir(): continue for md in base.rglob("*.md"): if md.name.startswith("_"): continue fm = _parse_frontmatter(md) if fm and fm.get("type") not in ("source", "musing", None): claims.append(md) return claims def build_reverse_link_index(claims: list[Path]) -> dict[str, set[Path]]: """Build a reverse index: claim_name_variant → set of files that link TO it. For each claim, extract all outgoing edges. For each target name, record the source claim as an incoming link for that target. """ # name_variant → set of source paths that point to it incoming: dict[str, set[Path]] = {} for claim_path in claims: targets = _get_edge_targets(claim_path) for target in targets: if target not in incoming: incoming[target] = set() incoming[target].add(claim_path) return incoming def find_orphans(claims: list[Path], incoming: dict[str, set[Path]], repo_root: Path = None) -> list[Path]: """Find claims with zero incoming links.""" orphans = [] for claim_path in claims: variants = _claim_name_variants(claim_path, repo_root) has_incoming = any( len(incoming.get(v, set()) - {claim_path}) > 0 for v in variants ) if not has_incoming: orphans.append(claim_path) return orphans def sort_orphans_by_domain(orphans: list[Path], repo_root: Path) -> list[Path]: """Sort orphans by domain priority (diversity first, internet-finance last).""" def domain_key(path: Path) -> tuple[int, str]: rel = path.relative_to(repo_root) parts = rel.parts domain = "" if len(parts) >= 2 and parts[0] in ("domains", "entities", "decisions"): domain = parts[1] elif parts[0] == "foundations" and len(parts) >= 2: domain = parts[1] elif parts[0] == "core": domain = "core" try: priority = DOMAIN_PRIORITY.index(domain) except ValueError: # Unknown domain goes before internet-finance but after known ones priority = len(DOMAIN_PRIORITY) - 1 return (priority, path.stem) return sorted(orphans, key=domain_key) # ─── Qdrant Search ─────────────────────────────────────────────────────────── def _get_api_key() -> str: """Load OpenRouter API key.""" key_file = SECRETS_DIR / "openrouter-key" if key_file.exists(): return key_file.read_text().strip() key = os.environ.get("OPENROUTER_API_KEY", "") if key: return key logger.error("No OpenRouter API key found") sys.exit(1) def make_point_id(rel_path: str) -> str: """Deterministic point ID from repo-relative path (matches embed-claims.py).""" return hashlib.md5(rel_path.encode()).hexdigest() def get_vector_from_qdrant(rel_path: str) -> list[float] | None: """Retrieve a claim's existing vector from Qdrant by its point ID.""" point_id = make_point_id(rel_path) body = json.dumps({"ids": [point_id], "with_vector": True}).encode() req = urllib.request.Request( f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points", data=body, headers={"Content-Type": "application/json"}, ) try: with urllib.request.urlopen(req, timeout=10) as resp: data = json.loads(resp.read()) points = data.get("result", []) if points and points[0].get("vector"): return points[0]["vector"] except Exception as e: logger.warning("Qdrant point lookup failed for %s: %s", rel_path, e) return None def search_neighbors(vector: list[float], exclude_path: str, threshold: float, limit: int) -> list[dict]: """Search Qdrant for nearest neighbors above threshold, excluding self.""" body = { "vector": vector, "limit": limit + 5, # over-fetch to account for self + filtered "with_payload": True, "score_threshold": threshold, "filter": { "must_not": [{"key": "claim_path", "match": {"value": exclude_path}}] }, } req = urllib.request.Request( f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search", data=json.dumps(body).encode(), headers={"Content-Type": "application/json"}, ) try: with urllib.request.urlopen(req, timeout=10) as resp: data = json.loads(resp.read()) hits = data.get("result", []) return hits[:limit] except Exception as e: logger.warning("Qdrant search failed: %s", e) return [] # ─── Haiku Edge Classification ─────────────────────────────────────────────── CLASSIFY_PROMPT = """You are classifying the relationship between two knowledge claims. CLAIM A (the orphan — needs to be connected): Title: {orphan_title} Body: {orphan_body} CLAIM B (the neighbor — already connected in the knowledge graph): Title: {neighbor_title} Body: {neighbor_body} What is the relationship FROM Claim B TO Claim A? Options: - "supports" — Claim B provides evidence, reasoning, or examples that strengthen Claim A - "challenges" — Claim B contradicts, undermines, or provides counter-evidence to Claim A. NOTE: "challenges" is underused — if one claim says X works and another says X fails, or they propose incompatible mechanisms, that IS a challenge. Use it. - "related" — Claims are topically connected but neither supports nor challenges the other. This is the WEAKEST edge — prefer supports/challenges when the relationship has directionality. Respond with EXACTLY this JSON format, nothing else: {{"edge_type": "supports|challenges|related", "confidence": 0.0-1.0, "reason": "one sentence explanation"}} """ def classify_edge(orphan_title: str, orphan_body: str, neighbor_title: str, neighbor_body: str, api_key: str) -> dict: """Use Haiku to classify the edge type between two claims. Returns {"edge_type": str, "confidence": float, "reason": str}. Falls back to "related" on any failure. """ default = {"edge_type": "related", "confidence": 0.5, "reason": "classification failed"} prompt = CLASSIFY_PROMPT.format( orphan_title=orphan_title, orphan_body=orphan_body[:500], neighbor_title=neighbor_title, neighbor_body=neighbor_body[:500], ) payload = json.dumps({ "model": "anthropic/claude-3.5-haiku", "messages": [{"role": "user", "content": prompt}], "max_tokens": 200, "temperature": 0.3, }).encode() req = urllib.request.Request( "https://openrouter.ai/api/v1/chat/completions", data=payload, headers={ "Authorization": f"Bearer {api_key}", "Content-Type": "application/json", }, ) try: with urllib.request.urlopen(req, timeout=15) as resp: data = json.loads(resp.read()) content = data["choices"][0]["message"]["content"].strip() # Parse JSON from response (handle markdown code blocks) if content.startswith("```"): content = content.split("\n", 1)[-1].rsplit("```", 1)[0].strip() result = json.loads(content) edge_type = result.get("edge_type", "related") confidence = float(result.get("confidence", 0.5)) # Enforce confidence floor for supports/challenges if edge_type in ("supports", "challenges") and confidence < HAIKU_CONFIDENCE_FLOOR: edge_type = "related" return { "edge_type": edge_type, "confidence": confidence, "reason": result.get("reason", ""), } except Exception as e: logger.warning("Haiku classification failed: %s", e) return default # ─── YAML Frontmatter Editing ──────────────────────────────────────────────── def _count_reweave_edges(path: Path) -> int: """Count existing reweave_edges in a file's frontmatter.""" fm = _parse_frontmatter(path) if not fm: return 0 rw = fm.get("reweave_edges") if isinstance(rw, list): return len(rw) return 0 def write_edge(neighbor_path: Path, orphan_title: str, edge_type: str, date_str: str, dry_run: bool = False) -> bool: """Write a reweave edge on the neighbor's frontmatter. Adds to both the edge_type list (related/supports/challenges) and the parallel reweave_edges list for provenance tracking. Uses ruamel.yaml for round-trip YAML preservation. """ # Check per-file cap if _count_reweave_edges(neighbor_path) >= PER_FILE_EDGE_CAP: logger.info(" Skip %s — per-file edge cap (%d) reached", neighbor_path.name, PER_FILE_EDGE_CAP) return False try: text = neighbor_path.read_text(errors="replace") except Exception as e: logger.warning(" Cannot read %s: %s", neighbor_path, e) return False if not text.startswith("---"): logger.warning(" No frontmatter in %s", neighbor_path.name) return False end = text.find("\n---", 3) if end == -1: return False fm_text = text[3:end] body_text = text[end:] # includes the closing --- # Try ruamel.yaml for round-trip editing try: from ruamel.yaml import YAML ry = YAML() ry.preserve_quotes = True ry.width = 4096 # prevent line wrapping import io fm = ry.load(fm_text) if not isinstance(fm, dict): return False # Add to edge_type list (related/supports/challenges) # Clean value only — provenance tracked in reweave_edges (Ganymede: comment-in-string bug) if edge_type not in fm: fm[edge_type] = [] elif not isinstance(fm[edge_type], list): fm[edge_type] = [fm[edge_type]] # Check for duplicate existing = [str(v).strip().lower() for v in fm[edge_type] if v] if orphan_title.strip().lower() in existing: logger.info(" Skip duplicate edge: %s → %s", neighbor_path.name, orphan_title) return False fm[edge_type].append(orphan_title) # Add to reweave_edges with provenance (edge_type + date for audit trail) if "reweave_edges" not in fm: fm["reweave_edges"] = [] elif not isinstance(fm["reweave_edges"], list): fm["reweave_edges"] = [fm["reweave_edges"]] fm["reweave_edges"].append(f"{orphan_title}|{edge_type}|{date_str}") # Serialize back buf = io.StringIO() ry.dump(fm, buf) new_fm = buf.getvalue().rstrip("\n") new_text = f"---\n{new_fm}{body_text}" if not dry_run: neighbor_path.write_text(new_text) return True except ImportError: # Fallback: regex-based editing (no ruamel.yaml installed) logger.info(" ruamel.yaml not available, using regex fallback") return _write_edge_regex(neighbor_path, fm_text, body_text, orphan_title, edge_type, date_str, dry_run) def _write_edge_regex(neighbor_path: Path, fm_text: str, body_text: str, orphan_title: str, edge_type: str, date_str: str, dry_run: bool) -> bool: """Fallback: add edge via regex when ruamel.yaml is unavailable.""" # Strip leading newline from fm_text (text[3:end] includes \n after ---) fm_text = fm_text.lstrip("\n") # Check for duplicate before writing existing_re = re.compile( rf'^\s*-\s*["\']?{re.escape(orphan_title)}["\']?\s*$', re.MULTILINE | re.IGNORECASE, ) if existing_re.search(fm_text): logger.info(" Skip duplicate edge (regex): %s → %s", neighbor_path.name, orphan_title) return False # Check if edge_type field exists field_re = re.compile(rf"^{edge_type}:\s*$", re.MULTILINE) inline_re = re.compile(rf'^{edge_type}:\s*\[', re.MULTILINE) entry_line = f' - "{orphan_title}"' rw_line = f' - "{orphan_title}|{edge_type}|{date_str}"' if field_re.search(fm_text): # Multi-line list exists — find end of list, append lines = fm_text.split("\n") new_lines = [] in_field = False inserted = False for line in lines: new_lines.append(line) if re.match(rf"^{edge_type}:\s*$", line): in_field = True elif in_field and not line.startswith(" -"): # End of list — insert before this line new_lines.insert(-1, entry_line) in_field = False inserted = True if in_field and not inserted: # Field was last in frontmatter new_lines.append(entry_line) fm_text = "\n".join(new_lines) elif inline_re.search(fm_text): # Inline list — skip, too complex for regex logger.warning(" Inline list format for %s in %s, skipping", edge_type, neighbor_path.name) return False else: # Field doesn't exist — add at end of frontmatter fm_text = fm_text.rstrip("\n") + f"\n{edge_type}:\n{entry_line}" # Add reweave_edges field if "reweave_edges:" in fm_text: lines = fm_text.split("\n") new_lines = [] in_rw = False inserted_rw = False for line in lines: new_lines.append(line) if re.match(r"^reweave_edges:\s*$", line): in_rw = True elif in_rw and not line.startswith(" -"): new_lines.insert(-1, rw_line) in_rw = False inserted_rw = True if in_rw and not inserted_rw: new_lines.append(rw_line) fm_text = "\n".join(new_lines) else: fm_text = fm_text.rstrip("\n") + f"\nreweave_edges:\n{rw_line}" new_text = f"---\n{fm_text}{body_text}" if not dry_run: neighbor_path.write_text(new_text) return True # ─── Git + PR ──────────────────────────────────────────────────────────────── def create_branch(repo_root: Path, branch_name: str) -> bool: """Create and checkout a new branch. Cleans up stale local/remote branches from prior failed runs.""" # Delete stale local branch if it exists (e.g., from a failed earlier run today) subprocess.run(["git", "branch", "-D", branch_name], cwd=str(repo_root), capture_output=True) # ignore errors if branch doesn't exist # Delete stale remote branch if it exists token_file = SECRETS_DIR / "forgejo-admin-token" if token_file.exists(): token = token_file.read_text().strip() push_url = f"http://teleo:{token}@localhost:3000/teleo/teleo-codex.git" subprocess.run(["git", "push", push_url, "--delete", branch_name], cwd=str(repo_root), capture_output=True) # ignore errors if branch doesn't exist try: subprocess.run(["git", "checkout", "-b", branch_name], cwd=str(repo_root), check=True, capture_output=True) return True except subprocess.CalledProcessError as e: logger.error("Failed to create branch %s: %s", branch_name, e.stderr.decode()) return False def commit_and_push(repo_root: Path, branch_name: str, modified_files: list[Path], orphan_count: int) -> bool: """Stage modified files, commit, and push.""" # Stage only modified files for f in modified_files: subprocess.run(["git", "add", str(f)], cwd=str(repo_root), check=True, capture_output=True) # Check if anything staged result = subprocess.run(["git", "diff", "--cached", "--name-only"], cwd=str(repo_root), capture_output=True, text=True) if not result.stdout.strip(): logger.info("No files staged — nothing to commit") return False msg = ( f"reweave: connect {orphan_count} orphan claims via vector similarity\n\n" f"Threshold: {DEFAULT_THRESHOLD}, Haiku classification, {len(modified_files)} files modified.\n\n" f"Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>" ) subprocess.run(["git", "commit", "-m", msg], cwd=str(repo_root), check=True, capture_output=True) # Push — inject token token_file = SECRETS_DIR / "forgejo-admin-token" if not token_file.exists(): logger.error("No Forgejo token found at %s", token_file) return False token = token_file.read_text().strip() push_url = f"http://teleo:{token}@localhost:3000/teleo/teleo-codex.git" subprocess.run(["git", "push", "-u", push_url, branch_name], cwd=str(repo_root), check=True, capture_output=True) return True def create_pr(branch_name: str, orphan_count: int, summary_lines: list[str]) -> str | None: """Create a Forgejo PR for the reweave batch.""" token_file = SECRETS_DIR / "forgejo-admin-token" if not token_file.exists(): return None token = token_file.read_text().strip() summary = "\n".join(f"- {line}" for line in summary_lines[:30]) body = ( f"## Orphan Reweave\n\n" f"Connected **{orphan_count}** orphan claims to the knowledge graph " f"via vector similarity (threshold {DEFAULT_THRESHOLD}) + Haiku edge classification.\n\n" f"### Edges Added\n{summary}\n\n" f"### Review Guide\n" f"- Each edge has a `# reweave:YYYY-MM-DD` comment — strip after review\n" f"- `reweave_edges` field tracks automated edges for tooling (graph_expand weights them 0.75x)\n" f"- Upgrade `related` → `supports`/`challenges` where you have better judgment\n" f"- Delete any edges that don't make sense\n\n" f"Pentagon-Agent: Epimetheus" ) payload = json.dumps({ "title": f"reweave: connect {orphan_count} orphan claims", "body": body, "head": branch_name, "base": "main", }).encode() req = urllib.request.Request( f"{FORGEJO_URL}/api/v1/repos/teleo/teleo-codex/pulls", data=payload, headers={ "Authorization": f"token {token}", "Content-Type": "application/json", }, ) try: with urllib.request.urlopen(req, timeout=30) as resp: data = json.loads(resp.read()) return data.get("html_url", "") except Exception as e: logger.error("PR creation failed: %s", e) return None # ─── Worktree Lock ─────────────────────────────────────────────────────────── _lock_fd = None # Module-level to prevent GC and avoid function-attribute fragility def acquire_lock(lock_path: Path, timeout: int = 30) -> bool: """Acquire file lock for worktree access. Returns True if acquired.""" global _lock_fd import fcntl try: lock_path.parent.mkdir(parents=True, exist_ok=True) _lock_fd = open(lock_path, "w") fcntl.flock(_lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) _lock_fd.write(f"reweave:{os.getpid()}\n") _lock_fd.flush() return True except (IOError, OSError): logger.warning("Could not acquire worktree lock at %s — another process has it", lock_path) _lock_fd = None return False def release_lock(lock_path: Path): """Release worktree lock.""" global _lock_fd import fcntl fd = _lock_fd _lock_fd = None if fd: try: fcntl.flock(fd, fcntl.LOCK_UN) fd.close() except Exception: pass try: lock_path.unlink(missing_ok=True) except Exception: pass # ─── Main ──────────────────────────────────────────────────────────────────── def main(): global REPO_DIR, DEFAULT_THRESHOLD parser = argparse.ArgumentParser(description="Orphan Reweave — connect isolated claims") parser.add_argument("--dry-run", action="store_true", help="Show what would be connected without modifying files") parser.add_argument("--max-orphans", type=int, default=DEFAULT_MAX_ORPHANS, help=f"Max orphans to process (default {DEFAULT_MAX_ORPHANS})") parser.add_argument("--max-neighbors", type=int, default=DEFAULT_MAX_NEIGHBORS, help=f"Max neighbors per orphan (default {DEFAULT_MAX_NEIGHBORS})") parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD, help=f"Minimum cosine similarity (default {DEFAULT_THRESHOLD})") parser.add_argument("--repo-dir", type=str, default=None, help="Override repo directory") args = parser.parse_args() if args.repo_dir: REPO_DIR = Path(args.repo_dir) DEFAULT_THRESHOLD = args.threshold date_str = datetime.date.today().isoformat() branch_name = f"reweave/{date_str}" logger.info("=== Orphan Reweave ===") logger.info("Repo: %s", REPO_DIR) logger.info("Threshold: %.2f, Max orphans: %d, Max neighbors: %d", args.threshold, args.max_orphans, args.max_neighbors) if args.dry_run: logger.info("DRY RUN — no files will be modified") # Step 1: Find all claims and build reverse-link index logger.info("Step 1: Scanning KB for claims...") claims = find_all_claims(REPO_DIR) logger.info(" Found %d knowledge files", len(claims)) logger.info("Step 2: Building reverse-link index...") incoming = build_reverse_link_index(claims) logger.info("Step 3: Finding orphans...") orphans = find_orphans(claims, incoming, REPO_DIR) orphans = sort_orphans_by_domain(orphans, REPO_DIR) logger.info(" Found %d orphans (%.1f%% of %d claims)", len(orphans), 100 * len(orphans) / max(len(claims), 1), len(claims)) if not orphans: logger.info("No orphans found — KB is fully connected!") return # Cap to max_orphans batch = orphans[:args.max_orphans] logger.info(" Processing batch of %d orphans", len(batch)) # Step 4: For each orphan, find neighbors and classify edges api_key = _get_api_key() edges_to_write: list[dict] = [] # {neighbor_path, orphan_title, edge_type, reason, score} skipped_no_vector = 0 skipped_no_neighbors = 0 skipped_entity_pair = 0 skipped_same_source = 0 for i, orphan_path in enumerate(batch): rel_path = str(orphan_path.relative_to(REPO_DIR)) fm = _parse_frontmatter(orphan_path) orphan_title = fm.get("name", fm.get("title", orphan_path.stem.replace("-", " "))) if fm else orphan_path.stem orphan_body = _get_body(orphan_path) logger.info("[%d/%d] %s", i + 1, len(batch), orphan_title[:80]) # Get vector from Qdrant vector = get_vector_from_qdrant(rel_path) if not vector: logger.info(" No vector in Qdrant — skipping (not embedded yet)") skipped_no_vector += 1 continue # Find neighbors hits = search_neighbors(vector, rel_path, args.threshold, args.max_neighbors) if not hits: logger.info(" No neighbors above threshold %.2f", args.threshold) skipped_no_neighbors += 1 continue for hit in hits: payload = hit.get("payload", {}) neighbor_rel = payload.get("claim_path", "") neighbor_title = payload.get("claim_title", "") score = hit.get("score", 0) if not neighbor_rel: continue neighbor_path = REPO_DIR / neighbor_rel if not neighbor_path.exists(): logger.info(" Neighbor %s not found on disk — skipping", neighbor_rel) continue # Entity-to-entity exclusion: entities need different vocabulary # (founded_by, competes_with, etc.) not supports/challenges if _is_entity(orphan_path) and _is_entity(neighbor_path): logger.info(" Skip entity-entity pair: %s ↔ %s", orphan_path.name, neighbor_path.name) skipped_entity_pair += 1 continue # Same-source exclusion: N claims from one paper all "supporting" each other # inflates graph density without adding information if _same_source(orphan_path, neighbor_path): logger.info(" Skip same-source pair: %s ↔ %s", orphan_path.name, neighbor_path.name) skipped_same_source += 1 continue neighbor_body = _get_body(neighbor_path) # Classify with Haiku result = classify_edge(orphan_title, orphan_body, neighbor_title, neighbor_body, api_key) edge_type = result["edge_type"] confidence = result["confidence"] reason = result["reason"] logger.info(" → %s (%.3f) %s [%.2f]: %s", neighbor_title[:50], score, edge_type, confidence, reason[:60]) edges_to_write.append({ "neighbor_path": neighbor_path, "neighbor_rel": neighbor_rel, "neighbor_title": neighbor_title, "orphan_title": str(orphan_title), "orphan_rel": rel_path, "edge_type": edge_type, "score": score, "confidence": confidence, "reason": reason, }) # Rate limit courtesy if not args.dry_run and i < len(batch) - 1: time.sleep(0.3) logger.info("\n=== Summary ===") logger.info("Orphans processed: %d", len(batch)) logger.info("Edges to write: %d", len(edges_to_write)) logger.info("Skipped (no vector): %d", skipped_no_vector) logger.info("Skipped (no neighbors): %d", skipped_no_neighbors) logger.info("Skipped (entity-entity): %d", skipped_entity_pair) logger.info("Skipped (same-source): %d", skipped_same_source) if not edges_to_write: logger.info("Nothing to write.") return if args.dry_run: logger.info("\n=== Dry Run — Edges That Would Be Written ===") for e in edges_to_write: logger.info(" %s → [%s] → %s (score=%.3f, conf=%.2f)", e["neighbor_title"][:40], e["edge_type"], e["orphan_title"][:40], e["score"], e["confidence"]) return # Step 5: Acquire lock, create branch, write edges, commit, push, create PR lock_path = REPO_DIR.parent / ".main-worktree.lock" if not acquire_lock(lock_path): logger.error("Cannot acquire worktree lock — aborting") sys.exit(1) try: # Create branch if not create_branch(REPO_DIR, branch_name): logger.error("Failed to create branch %s", branch_name) sys.exit(1) # Write edges modified_files = set() written = 0 summary_lines = [] for e in edges_to_write: ok = write_edge( e["neighbor_path"], e["orphan_title"], e["edge_type"], date_str, dry_run=False, ) if ok: modified_files.add(e["neighbor_path"]) written += 1 summary_lines.append( f"`{e['neighbor_title'][:50]}` → [{e['edge_type']}] → " f"`{e['orphan_title'][:50]}` (score={e['score']:.3f})" ) logger.info("Wrote %d edges across %d files", written, len(modified_files)) if not modified_files: logger.info("No edges written — cleaning up branch") subprocess.run(["git", "checkout", "main"], cwd=str(REPO_DIR), capture_output=True) subprocess.run(["git", "branch", "-d", branch_name], cwd=str(REPO_DIR), capture_output=True) return # Commit and push orphan_count = len(set(e["orphan_title"] for e in edges_to_write if e["neighbor_path"] in modified_files)) if commit_and_push(REPO_DIR, branch_name, list(modified_files), orphan_count): logger.info("Pushed branch %s", branch_name) # Create PR pr_url = create_pr(branch_name, orphan_count, summary_lines) if pr_url: logger.info("PR created: %s", pr_url) else: logger.warning("PR creation failed — branch is pushed, create manually") else: logger.error("Commit/push failed") finally: # Always return to main — even on exception (Ganymede: branch cleanup) try: subprocess.run(["git", "checkout", "main"], cwd=str(REPO_DIR), capture_output=True) except Exception: pass release_lock(lock_path) logger.info("Done.") if __name__ == "__main__": main()