teleo-codex/ops/pipeline-v2/reweave.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00

972 lines
36 KiB
Python

#!/usr/bin/env python3
"""Orphan Reweave — connect isolated claims via vector similarity + Haiku classification.
Finds claims with zero incoming links (orphans), uses Qdrant to find semantically
similar neighbors, classifies the relationship with Haiku, and writes edges on the
neighbor's frontmatter pointing TO the orphan.
Usage:
python3 reweave.py --dry-run # Show what would be connected
python3 reweave.py --max-orphans 50 # Process up to 50 orphans
python3 reweave.py --threshold 0.72 # Override similarity floor
Design:
- Orphan = zero incoming links (no other claim's supports/challenges/related/depends_on points to it)
- Write edge on NEIGHBOR (not orphan) so orphan gains an incoming link
- Haiku classifies: supports | challenges | related (>=0.85 confidence for supports/challenges)
- reweave_edges parallel field for tooling-readable provenance
- Single PR per run for Leo review
Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>
"""
import argparse
import datetime
import hashlib
import json
import logging
import os
import re
import subprocess
import sys
import time
import urllib.request
from pathlib import Path
import yaml
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("reweave")
# --- Config ---
# All locations/endpoints are overridable via environment for test deployments.
REPO_DIR = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main"))  # KB git worktree
SECRETS_DIR = Path(os.environ.get("SECRETS_DIR", "/opt/teleo-eval/secrets"))  # API token files
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")  # vector store
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "teleo-claims")
FORGEJO_URL = os.environ.get("FORGEJO_URL", "http://localhost:3000")  # git forge (PR API)
# Top-level KB directories scanned for claim files.
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
# Frontmatter fields that count as outgoing edges.
EDGE_FIELDS = ("supports", "challenges", "challenged_by", "depends_on", "related")
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")  # [[target]] links in body text
# Thresholds (from calibration data — Mar 28)
DEFAULT_THRESHOLD = 0.70  # Elbow in score distribution
DEFAULT_MAX_ORPHANS = 50  # Keep PRs reviewable
DEFAULT_MAX_NEIGHBORS = 3  # Don't over-connect
HAIKU_CONFIDENCE_FLOOR = 0.85  # Below this → default to "related"
PER_FILE_EDGE_CAP = 10  # Max total reweave edges per neighbor file
# Domain processing order: diversity first, internet-finance last (Leo)
DOMAIN_PRIORITY = [
    "ai-alignment", "health", "space-development", "entertainment",
    "creative-industries", "collective-intelligence", "governance",
    # internet-finance last — batch-imported futarchy cluster, lower cross-domain value
    "internet-finance",
]
# ─── Orphan Detection ────────────────────────────────────────────────────────
def _parse_frontmatter(path: Path) -> dict | None:
    """Return the YAML frontmatter of a markdown file as a dict, or None.

    None is returned when the file is unreadable, lacks a leading ``---``
    delimiter, has no closing delimiter, or its YAML does not parse to a
    mapping. Never raises.
    """
    try:
        raw = path.read_text(errors="replace")
    except Exception:
        return None
    if not raw.startswith("---"):
        return None
    close = raw.find("\n---", 3)
    if close == -1:
        return None
    try:
        parsed = yaml.safe_load(raw[3:close])
    except Exception:
        return None
    return parsed if isinstance(parsed, dict) else None
def _get_body(path: Path) -> str:
"""Get body text (after frontmatter) from a markdown file."""
try:
text = path.read_text(errors="replace")
except Exception:
return ""
if not text.startswith("---"):
return text
end = text.find("\n---", 3)
if end == -1:
return text
return text[end + 4:].strip()
def _get_edge_targets(path: Path) -> list[str]:
    """Collect every outgoing edge target named by *path*, lowercased.

    Sources: the EDGE_FIELDS frontmatter lists/scalars, reweave_edges
    entries written by earlier runs, and [[wiki links]] in the body.
    """
    found: list[str] = []
    fm = _parse_frontmatter(path)
    if fm:
        for field in EDGE_FIELDS:
            value = fm.get(field)
            if isinstance(value, list):
                found.extend(str(item).strip().lower() for item in value if item)
            elif isinstance(value, str) and value.strip():
                found.append(value.strip().lower())
        # Also check reweave_edges (from previous runs)
        previous = fm.get("reweave_edges")
        if isinstance(previous, list):
            found.extend(str(item).strip().lower() for item in previous if item)
    # Wiki links in body
    try:
        raw = path.read_text(errors="replace")
        close = raw.find("\n---", 3)
        if close > 0:
            for link in WIKI_LINK_RE.findall(raw[close + 4:]):
                found.append(link.strip().lower())
    except Exception:
        pass
    return found
def _claim_name_variants(path: Path, repo_root: Path = None) -> list[str]:
    """Return the lowercase names under which this claim may be referenced.

    A claim at domains/ai-alignment/rlhf-reward-hacking.md can be cited as
    its stem, its stem with dashes as spaces, its repo-relative path minus
    the .md suffix, or its frontmatter 'name'/'title' value.
    """
    names = {path.stem.lower(), path.stem.lower().replace("-", " ")}
    # Also match by relative path (Ganymede Q1: some edges use path references)
    if repo_root:
        try:
            names.add(str(path.relative_to(repo_root)).removesuffix(".md").lower())
        except ValueError:
            pass
    fm = _parse_frontmatter(path)
    if fm:
        for key in ("name", "title"):
            value = fm.get(key)
            if isinstance(value, str) and value.strip():
                names.add(value.strip().lower())
    return list(names)
def _is_entity(path: Path) -> bool:
    """True when *path* is an entity file rather than a claim.

    Entities use a different edge vocabulary, so reweave must not connect
    entity↔entity. Detection: frontmatter ``type: entity``, or an exact
    "entities" path component (component match avoids false positives on
    names like "entities-overview").
    """
    front = _parse_frontmatter(path)
    if front and front.get("type") == "entity":
        return True
    return "entities" in Path(path).parts
def _same_source(path_a: Path, path_b: Path) -> bool:
    """True when both claims cite the same source material.

    Used to veto edges between sibling claims extracted from one paper:
    N claims "supporting" each other inflates graph density without
    adding information.
    """
    fm_a = _parse_frontmatter(path_a)
    fm_b = _parse_frontmatter(path_b)
    if not (fm_a and fm_b):
        return False
    source_a = fm_a.get("source") or fm_a.get("source_file") or ""
    source_b = fm_b.get("source") or fm_b.get("source_file") or ""
    if not (source_a and source_b):
        return False
    return str(source_a).strip() == str(source_b).strip()
def find_all_claims(repo_root: Path) -> list[Path]:
    """Return every knowledge file (claim/framework/entity/decision) in the KB.

    Walks EMBED_DIRS recursively, skipping underscore-prefixed files and
    anything whose frontmatter type is source, musing, or absent.
    """
    results: list[Path] = []
    for dirname in EMBED_DIRS:
        root = repo_root / dirname
        if not root.is_dir():
            continue
        for md_file in root.rglob("*.md"):
            if md_file.name.startswith("_"):
                continue
            front = _parse_frontmatter(md_file)
            if front and front.get("type") not in ("source", "musing", None):
                results.append(md_file)
    return results
def build_reverse_link_index(claims: list[Path]) -> dict[str, set[Path]]:
    """Map each referenced target name to the set of files that link to it.

    Every outgoing edge of every claim is inverted: for a claim linking to
    target T, the claim's path is recorded as one of T's incoming links.
    """
    incoming: dict[str, set[Path]] = {}
    for source_path in claims:
        for target in _get_edge_targets(source_path):
            incoming.setdefault(target, set()).add(source_path)
    return incoming
def find_orphans(claims: list[Path], incoming: dict[str, set[Path]],
                 repo_root: Path = None) -> list[Path]:
    """Return the claims that no *other* file links to (zero incoming edges).

    Self-links are ignored: a claim pointing at itself does not count as
    an incoming edge.
    """
    orphans: list[Path] = []
    for candidate in claims:
        linked = False
        for variant in _claim_name_variants(candidate, repo_root):
            if incoming.get(variant, set()) - {candidate}:
                linked = True
                break
        if not linked:
            orphans.append(candidate)
    return orphans
def sort_orphans_by_domain(orphans: list[Path], repo_root: Path) -> list[Path]:
    """Sort orphans by domain priority (diversity first, internet-finance last).

    The domain is the second path component under domains/, entities/,
    decisions/ and foundations/; core/ files use the pseudo-domain "core".
    Ties break on file stem for a deterministic order.
    """
    def domain_key(path: Path) -> tuple[float, str]:
        rel = path.relative_to(repo_root)
        parts = rel.parts
        domain = ""
        if len(parts) >= 2 and parts[0] in ("domains", "entities", "decisions"):
            domain = parts[1]
        elif parts[0] == "foundations" and len(parts) >= 2:
            domain = parts[1]
        elif parts[0] == "core":
            domain = "core"
        try:
            priority = float(DOMAIN_PRIORITY.index(domain))
        except ValueError:
            # Unknown domain goes after all known domains but strictly before
            # internet-finance (the last DOMAIN_PRIORITY entry). The previous
            # value, len(DOMAIN_PRIORITY) - 1, equalled internet-finance's own
            # index and so tied/interleaved with it instead of preceding it.
            priority = len(DOMAIN_PRIORITY) - 1.5
        return (priority, path.stem)
    return sorted(orphans, key=domain_key)
# ─── Qdrant Search ───────────────────────────────────────────────────────────
def _get_api_key() -> str:
    """Return the OpenRouter API key from the secrets dir or environment.

    The secrets file takes precedence over OPENROUTER_API_KEY. Exits the
    process with status 1 when neither source provides a key.
    """
    key_path = SECRETS_DIR / "openrouter-key"
    if key_path.exists():
        return key_path.read_text().strip()
    env_key = os.environ.get("OPENROUTER_API_KEY", "")
    if env_key:
        return env_key
    logger.error("No OpenRouter API key found")
    sys.exit(1)
def make_point_id(rel_path: str) -> str:
    """Deterministic Qdrant point ID for a repo-relative path.

    MD5 serves as a stable 128-bit fingerprint here (not security); the
    scheme must stay in sync with embed-claims.py, which inserted the points.
    """
    digest = hashlib.md5(rel_path.encode())
    return digest.hexdigest()
def get_vector_from_qdrant(rel_path: str) -> list[float] | None:
    """Fetch a claim's stored embedding from Qdrant, or None.

    Looks the point up by its deterministic ID. Returns None when the
    point is missing, carries no vector, or the HTTP call fails (failure
    is logged, never raised).
    """
    request_body = json.dumps(
        {"ids": [make_point_id(rel_path)], "with_vector": True}
    ).encode()
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points",
        data=request_body,
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            points = json.loads(response.read()).get("result", [])
        if points and points[0].get("vector"):
            return points[0]["vector"]
    except Exception as exc:
        logger.warning("Qdrant point lookup failed for %s: %s", rel_path, exc)
    return None
def search_neighbors(vector: list[float], exclude_path: str,
                     threshold: float, limit: int) -> list[dict]:
    """Query Qdrant for the nearest claims to *vector* above *threshold*.

    The orphan itself is excluded server-side via a must_not filter on
    claim_path, and a few extra hits are requested so the caller still
    gets *limit* results after filtering. Returns [] on any failure.
    """
    query = {
        "vector": vector,
        "limit": limit + 5,  # over-fetch to account for self + filtered
        "with_payload": True,
        "score_threshold": threshold,
        "filter": {
            "must_not": [{"key": "claim_path", "match": {"value": exclude_path}}]
        },
    }
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search",
        data=json.dumps(query).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            hits = json.loads(response.read()).get("result", [])
        return hits[:limit]
    except Exception as exc:
        logger.warning("Qdrant search failed: %s", exc)
        return []
# ─── Haiku Edge Classification ───────────────────────────────────────────────
# Prompt template for the Haiku edge classifier. Placeholders are filled by
# classify_edge(); the doubled {{...}} braces render as literal {...} JSON.
CLASSIFY_PROMPT = """You are classifying the relationship between two knowledge claims.
CLAIM A (the orphan — needs to be connected):
Title: {orphan_title}
Body: {orphan_body}
CLAIM B (the neighbor — already connected in the knowledge graph):
Title: {neighbor_title}
Body: {neighbor_body}
What is the relationship FROM Claim B TO Claim A?
Options:
- "supports" — Claim B provides evidence, reasoning, or examples that strengthen Claim A
- "challenges" — Claim B contradicts, undermines, or provides counter-evidence to Claim A. NOTE: "challenges" is underused — if one claim says X works and another says X fails, or they propose incompatible mechanisms, that IS a challenge. Use it.
- "related" — Claims are topically connected but neither supports nor challenges the other. This is the WEAKEST edge — prefer supports/challenges when the relationship has directionality.
Respond with EXACTLY this JSON format, nothing else:
{{"edge_type": "supports|challenges|related", "confidence": 0.0-1.0, "reason": "one sentence explanation"}}
"""
def classify_edge(orphan_title: str, orphan_body: str,
                  neighbor_title: str, neighbor_body: str,
                  api_key: str) -> dict:
    """Use Haiku to classify the edge type between two claims.

    Returns {"edge_type": str, "confidence": float, "reason": str} where
    edge_type is guaranteed to be one of supports/challenges/related and
    confidence is clamped to [0, 1]. Falls back to "related" on any
    failure (HTTP error, bad JSON, or an out-of-vocabulary model answer).
    """
    default = {"edge_type": "related", "confidence": 0.5, "reason": "classification failed"}
    prompt = CLASSIFY_PROMPT.format(
        orphan_title=orphan_title,
        orphan_body=orphan_body[:500],  # cap bodies to keep the prompt small
        neighbor_title=neighbor_title,
        neighbor_body=neighbor_body[:500],
    )
    payload = json.dumps({
        "model": "anthropic/claude-3.5-haiku",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200,
        "temperature": 0.3,
    }).encode()
    req = urllib.request.Request(
        "https://openrouter.ai/api/v1/chat/completions",
        data=payload,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read())
        content = data["choices"][0]["message"]["content"].strip()
        # Parse JSON from response (handle markdown code blocks)
        if content.startswith("```"):
            content = content.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
        result = json.loads(content)
        # Normalize and validate the vocabulary: an unexpected value (e.g.
        # "Supports" or "contradicts") would otherwise flow into write_edge
        # and be written verbatim into frontmatter as a bogus field name.
        edge_type = str(result.get("edge_type", "related")).strip().lower()
        if edge_type not in ("supports", "challenges", "related"):
            edge_type = "related"
        # Clamp so downstream comparisons and logs stay sane even if the
        # model emits values outside [0, 1].
        confidence = max(0.0, min(1.0, float(result.get("confidence", 0.5))))
        # Enforce confidence floor for supports/challenges
        if edge_type in ("supports", "challenges") and confidence < HAIKU_CONFIDENCE_FLOOR:
            edge_type = "related"
        return {
            "edge_type": edge_type,
            "confidence": confidence,
            "reason": result.get("reason", ""),
        }
    except Exception as e:
        logger.warning("Haiku classification failed: %s", e)
        return default
# ─── YAML Frontmatter Editing ────────────────────────────────────────────────
def _count_reweave_edges(path: Path) -> int:
    """Number of reweave_edges entries already in *path*'s frontmatter.

    Returns 0 when the frontmatter is missing/unparseable or the field is
    absent or not a list.
    """
    front = _parse_frontmatter(path)
    if front:
        entries = front.get("reweave_edges")
        if isinstance(entries, list):
            return len(entries)
    return 0
def write_edge(neighbor_path: Path, orphan_title: str, edge_type: str,
               date_str: str, dry_run: bool = False) -> bool:
    """Write a reweave edge on the neighbor's frontmatter.

    Adds to both the edge_type list (related/supports/challenges) and
    the parallel reweave_edges list for provenance tracking.
    Uses ruamel.yaml for round-trip YAML preservation.

    Returns True when the file was modified (or would be, in dry_run
    mode); False on cap reached, unreadable/missing frontmatter, or a
    duplicate edge.
    """
    # Check per-file cap
    if _count_reweave_edges(neighbor_path) >= PER_FILE_EDGE_CAP:
        logger.info(" Skip %s — per-file edge cap (%d) reached", neighbor_path.name, PER_FILE_EDGE_CAP)
        return False
    try:
        text = neighbor_path.read_text(errors="replace")
    except Exception as e:
        logger.warning(" Cannot read %s: %s", neighbor_path, e)
        return False
    if not text.startswith("---"):
        logger.warning(" No frontmatter in %s", neighbor_path.name)
        return False
    end = text.find("\n---", 3)
    if end == -1:
        return False
    fm_text = text[3:end]
    body_text = text[end:]  # includes the closing ---
    # Try ruamel.yaml for round-trip editing
    # NOTE(review): only ImportError is caught below — a YAML parse error
    # inside this try (malformed frontmatter) will propagate to the caller.
    try:
        from ruamel.yaml import YAML
        ry = YAML()
        ry.preserve_quotes = True
        ry.width = 4096  # prevent line wrapping
        import io
        fm = ry.load(fm_text)
        if not isinstance(fm, dict):
            return False
        # Add to edge_type list (related/supports/challenges)
        # Clean value only — provenance tracked in reweave_edges (Ganymede: comment-in-string bug)
        if edge_type not in fm:
            fm[edge_type] = []
        elif not isinstance(fm[edge_type], list):
            # Promote a scalar field to a single-element list before appending.
            fm[edge_type] = [fm[edge_type]]
        # Check for duplicate (case-insensitive, whitespace-insensitive)
        existing = [str(v).strip().lower() for v in fm[edge_type] if v]
        if orphan_title.strip().lower() in existing:
            logger.info(" Skip duplicate edge: %s%s", neighbor_path.name, orphan_title)
            return False
        fm[edge_type].append(orphan_title)
        # Add to reweave_edges with provenance (edge_type + date for audit trail)
        if "reweave_edges" not in fm:
            fm["reweave_edges"] = []
        elif not isinstance(fm["reweave_edges"], list):
            fm["reweave_edges"] = [fm["reweave_edges"]]
        fm["reweave_edges"].append(f"{orphan_title}|{edge_type}|{date_str}")
        # Serialize back
        buf = io.StringIO()
        ry.dump(fm, buf)
        new_fm = buf.getvalue().rstrip("\n")
        # body_text starts with "\n---", so this reassembles the full file.
        new_text = f"---\n{new_fm}{body_text}"
        if not dry_run:
            neighbor_path.write_text(new_text)
        return True
    except ImportError:
        # Fallback: regex-based editing (no ruamel.yaml installed)
        logger.info(" ruamel.yaml not available, using regex fallback")
        return _write_edge_regex(neighbor_path, fm_text, body_text, orphan_title,
                                 edge_type, date_str, dry_run)
def _write_edge_regex(neighbor_path: Path, fm_text: str, body_text: str,
                      orphan_title: str, edge_type: str, date_str: str,
                      dry_run: bool) -> bool:
    """Fallback: add edge via regex when ruamel.yaml is unavailable.

    *fm_text* is the raw frontmatter between the --- markers; *body_text*
    is the rest of the file starting at the closing "\\n---". Returns True
    when the file was rewritten (or would be, in dry_run mode).
    """
    # Strip leading newline from fm_text (text[3:end] includes \n after ---)
    fm_text = fm_text.lstrip("\n")
    # Check for duplicate before writing
    existing_re = re.compile(
        rf'^\s*-\s*["\']?{re.escape(orphan_title)}["\']?\s*$',
        re.MULTILINE | re.IGNORECASE,
    )
    if existing_re.search(fm_text):
        logger.info(" Skip duplicate edge (regex): %s%s", neighbor_path.name, orphan_title)
        return False
    # Check if edge_type field exists
    field_re = re.compile(rf"^{edge_type}:\s*$", re.MULTILINE)
    inline_re = re.compile(rf'^{edge_type}:\s*\[', re.MULTILINE)
    entry_line = f' - "{orphan_title}"'
    rw_line = f' - "{orphan_title}|{edge_type}|{date_str}"'
    if field_re.search(fm_text):
        # Multi-line list exists — find end of list, append
        # NOTE(review): list items are assumed to start with " -" (one
        # leading space); items indented differently would end the list
        # early and misplace the insert — confirm against real KB files.
        lines = fm_text.split("\n")
        new_lines = []
        in_field = False
        inserted = False
        for line in lines:
            new_lines.append(line)
            if re.match(rf"^{edge_type}:\s*$", line):
                in_field = True
            elif in_field and not line.startswith(" -"):
                # End of list — insert before this line
                new_lines.insert(-1, entry_line)
                in_field = False
                inserted = True
        if in_field and not inserted:
            # Field was last in frontmatter
            new_lines.append(entry_line)
        fm_text = "\n".join(new_lines)
    elif inline_re.search(fm_text):
        # Inline list — skip, too complex for regex
        logger.warning(" Inline list format for %s in %s, skipping", edge_type, neighbor_path.name)
        return False
    else:
        # Field doesn't exist — add at end of frontmatter
        fm_text = fm_text.rstrip("\n") + f"\n{edge_type}:\n{entry_line}"
    # Add reweave_edges field (same insert strategy as the edge list above)
    if "reweave_edges:" in fm_text:
        lines = fm_text.split("\n")
        new_lines = []
        in_rw = False
        inserted_rw = False
        for line in lines:
            new_lines.append(line)
            if re.match(r"^reweave_edges:\s*$", line):
                in_rw = True
            elif in_rw and not line.startswith(" -"):
                new_lines.insert(-1, rw_line)
                in_rw = False
                inserted_rw = True
        if in_rw and not inserted_rw:
            new_lines.append(rw_line)
        fm_text = "\n".join(new_lines)
    else:
        fm_text = fm_text.rstrip("\n") + f"\nreweave_edges:\n{rw_line}"
    # body_text still starts with "\n---", closing the frontmatter block.
    new_text = f"---\n{fm_text}{body_text}"
    if not dry_run:
        neighbor_path.write_text(new_text)
    return True
# ─── Git + PR ────────────────────────────────────────────────────────────────
def create_branch(repo_root: Path, branch_name: str) -> bool:
    """Create and check out *branch_name*, clearing stale leftovers first.

    A failed earlier run may have left the branch locally and/or on the
    remote; both deletions are best-effort (errors ignored). Returns True
    when the `git checkout -b` succeeds.
    """
    # Best-effort: drop a stale local branch from a previous failed run.
    subprocess.run(["git", "branch", "-D", branch_name],
                   cwd=str(repo_root), capture_output=True)
    # Best-effort: drop the matching stale remote branch, if we have a token.
    token_file = SECRETS_DIR / "forgejo-admin-token"
    if token_file.exists():
        token = token_file.read_text().strip()
        push_url = f"http://teleo:{token}@localhost:3000/teleo/teleo-codex.git"
        subprocess.run(["git", "push", push_url, "--delete", branch_name],
                       cwd=str(repo_root), capture_output=True)
    try:
        subprocess.run(["git", "checkout", "-b", branch_name],
                       cwd=str(repo_root), check=True, capture_output=True)
    except subprocess.CalledProcessError as exc:
        logger.error("Failed to create branch %s: %s", branch_name, exc.stderr.decode())
        return False
    return True
def commit_and_push(repo_root: Path, branch_name: str, modified_files: list[Path],
                    orphan_count: int) -> bool:
    """Stage *modified_files*, commit the reweave batch, and push the branch.

    Returns False when nothing ended up staged or the Forgejo token file is
    missing; git failures on add/commit/push raise CalledProcessError.
    """
    # Stage only the files this run touched.
    for path in modified_files:
        subprocess.run(["git", "add", str(path)], cwd=str(repo_root),
                       check=True, capture_output=True)
    # Bail out cleanly when the staged diff is empty.
    staged = subprocess.run(["git", "diff", "--cached", "--name-only"],
                            cwd=str(repo_root), capture_output=True, text=True)
    if not staged.stdout.strip():
        logger.info("No files staged — nothing to commit")
        return False
    msg = (
        f"reweave: connect {orphan_count} orphan claims via vector similarity\n\n"
        f"Threshold: {DEFAULT_THRESHOLD}, Haiku classification, {len(modified_files)} files modified.\n\n"
        f"Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>"
    )
    subprocess.run(["git", "commit", "-m", msg], cwd=str(repo_root),
                   check=True, capture_output=True)
    # Push — inject token into the URL rather than relying on stored creds.
    token_file = SECRETS_DIR / "forgejo-admin-token"
    if not token_file.exists():
        logger.error("No Forgejo token found at %s", token_file)
        return False
    token = token_file.read_text().strip()
    push_url = f"http://teleo:{token}@localhost:3000/teleo/teleo-codex.git"
    subprocess.run(["git", "push", "-u", push_url, branch_name],
                   cwd=str(repo_root), check=True, capture_output=True)
    return True
def create_pr(branch_name: str, orphan_count: int, summary_lines: list[str]) -> str | None:
    """Open a Forgejo PR for the reweave batch; return its URL or None.

    The PR body lists up to 30 added edges plus reviewer guidance. Returns
    None when the token file is missing or the API call fails.
    """
    token_file = SECRETS_DIR / "forgejo-admin-token"
    if not token_file.exists():
        return None
    token = token_file.read_text().strip()
    summary = "\n".join(f"- {line}" for line in summary_lines[:30])
    body = (
        f"## Orphan Reweave\n\n"
        f"Connected **{orphan_count}** orphan claims to the knowledge graph "
        f"via vector similarity (threshold {DEFAULT_THRESHOLD}) + Haiku edge classification.\n\n"
        f"### Edges Added\n{summary}\n\n"
        f"### Review Guide\n"
        # Fixed stale guidance: write_edge writes clean edge values only —
        # provenance lives in reweave_edges, not in YAML comments (Ganymede
        # comment-in-string fix), so there is nothing to "strip after review".
        f"- Edge values are clean; provenance for each automated edge is recorded "
        f"in `reweave_edges` as `title|edge_type|YYYY-MM-DD`\n"
        f"- `reweave_edges` field tracks automated edges for tooling (graph_expand weights them 0.75x)\n"
        f"- Upgrade `related` → `supports`/`challenges` where you have better judgment\n"
        f"- Delete any edges that don't make sense\n\n"
        f"Pentagon-Agent: Epimetheus"
    )
    payload = json.dumps({
        "title": f"reweave: connect {orphan_count} orphan claims",
        "body": body,
        "head": branch_name,
        "base": "main",
    }).encode()
    req = urllib.request.Request(
        f"{FORGEJO_URL}/api/v1/repos/teleo/teleo-codex/pulls",
        data=payload,
        headers={
            "Authorization": f"token {token}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            data = json.loads(resp.read())
        return data.get("html_url", "")
    except Exception as e:
        logger.error("PR creation failed: %s", e)
        return None
# ─── Worktree Lock ───────────────────────────────────────────────────────────
# Module-level handle keeps the locked fd alive (GC would silently drop the
# flock) and avoids fragile function-attribute storage.
_lock_fd = None

def acquire_lock(lock_path: Path, timeout: int = 30) -> bool:
    """Take an exclusive non-blocking flock on *lock_path*.

    Writes "reweave:<pid>" into the lock file for debuggability. Returns
    True on success; False (with a warning) when another process holds the
    lock or the file cannot be created.
    """
    global _lock_fd
    import fcntl
    try:
        lock_path.parent.mkdir(parents=True, exist_ok=True)
        _lock_fd = open(lock_path, "w")
        fcntl.flock(_lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        _lock_fd.write(f"reweave:{os.getpid()}\n")
        _lock_fd.flush()
    except (IOError, OSError):
        logger.warning("Could not acquire worktree lock at %s — another process has it", lock_path)
        _lock_fd = None
        return False
    return True

def release_lock(lock_path: Path):
    """Drop the flock taken by acquire_lock and remove the lock file.

    Safe to call when no lock is held; every failure is swallowed.
    """
    global _lock_fd
    handle, _lock_fd = _lock_fd, None
    import fcntl
    if handle:
        try:
            fcntl.flock(handle, fcntl.LOCK_UN)
            handle.close()
        except Exception:
            pass
    try:
        lock_path.unlink(missing_ok=True)
    except Exception:
        pass
# ─── Main ────────────────────────────────────────────────────────────────────
def main():
    """CLI entry point.

    Pipeline: scan KB → reverse-link index → orphans → (per orphan) Qdrant
    neighbor search + Haiku edge classification → write edges on a dated
    branch → commit, push, open a Forgejo PR. --dry-run stops before any
    file is modified.
    """
    global REPO_DIR, DEFAULT_THRESHOLD
    parser = argparse.ArgumentParser(description="Orphan Reweave — connect isolated claims")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be connected without modifying files")
    parser.add_argument("--max-orphans", type=int, default=DEFAULT_MAX_ORPHANS,
                        help=f"Max orphans to process (default {DEFAULT_MAX_ORPHANS})")
    parser.add_argument("--max-neighbors", type=int, default=DEFAULT_MAX_NEIGHBORS,
                        help=f"Max neighbors per orphan (default {DEFAULT_MAX_NEIGHBORS})")
    parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD,
                        help=f"Minimum cosine similarity (default {DEFAULT_THRESHOLD})")
    parser.add_argument("--repo-dir", type=str, default=None,
                        help="Override repo directory")
    args = parser.parse_args()
    if args.repo_dir:
        REPO_DIR = Path(args.repo_dir)
    # Rebound so commit_and_push/create_pr report the threshold actually used.
    DEFAULT_THRESHOLD = args.threshold
    date_str = datetime.date.today().isoformat()
    branch_name = f"reweave/{date_str}"  # one branch (and PR) per calendar day
    logger.info("=== Orphan Reweave ===")
    logger.info("Repo: %s", REPO_DIR)
    logger.info("Threshold: %.2f, Max orphans: %d, Max neighbors: %d",
                args.threshold, args.max_orphans, args.max_neighbors)
    if args.dry_run:
        logger.info("DRY RUN — no files will be modified")
    # Step 1: Find all claims and build reverse-link index
    logger.info("Step 1: Scanning KB for claims...")
    claims = find_all_claims(REPO_DIR)
    logger.info(" Found %d knowledge files", len(claims))
    logger.info("Step 2: Building reverse-link index...")
    incoming = build_reverse_link_index(claims)
    logger.info("Step 3: Finding orphans...")
    orphans = find_orphans(claims, incoming, REPO_DIR)
    orphans = sort_orphans_by_domain(orphans, REPO_DIR)
    logger.info(" Found %d orphans (%.1f%% of %d claims)",
                len(orphans), 100 * len(orphans) / max(len(claims), 1), len(claims))
    if not orphans:
        logger.info("No orphans found — KB is fully connected!")
        return
    # Cap to max_orphans
    batch = orphans[:args.max_orphans]
    logger.info(" Processing batch of %d orphans", len(batch))
    # Step 4: For each orphan, find neighbors and classify edges
    api_key = _get_api_key()
    edges_to_write: list[dict] = []  # {neighbor_path, orphan_title, edge_type, reason, score}
    skipped_no_vector = 0
    skipped_no_neighbors = 0
    skipped_entity_pair = 0
    skipped_same_source = 0
    for i, orphan_path in enumerate(batch):
        rel_path = str(orphan_path.relative_to(REPO_DIR))
        fm = _parse_frontmatter(orphan_path)
        # Prefer frontmatter name/title; fall back to a de-dashed file stem.
        orphan_title = fm.get("name", fm.get("title", orphan_path.stem.replace("-", " "))) if fm else orphan_path.stem
        orphan_body = _get_body(orphan_path)
        logger.info("[%d/%d] %s", i + 1, len(batch), orphan_title[:80])
        # Get vector from Qdrant
        vector = get_vector_from_qdrant(rel_path)
        if not vector:
            logger.info(" No vector in Qdrant — skipping (not embedded yet)")
            skipped_no_vector += 1
            continue
        # Find neighbors
        hits = search_neighbors(vector, rel_path, args.threshold, args.max_neighbors)
        if not hits:
            logger.info(" No neighbors above threshold %.2f", args.threshold)
            skipped_no_neighbors += 1
            continue
        for hit in hits:
            payload = hit.get("payload", {})
            neighbor_rel = payload.get("claim_path", "")
            neighbor_title = payload.get("claim_title", "")
            score = hit.get("score", 0)
            if not neighbor_rel:
                continue
            neighbor_path = REPO_DIR / neighbor_rel
            if not neighbor_path.exists():
                # The Qdrant index can lag behind deletions in the worktree.
                logger.info(" Neighbor %s not found on disk — skipping", neighbor_rel)
                continue
            # Entity-to-entity exclusion: entities need different vocabulary
            # (founded_by, competes_with, etc.) not supports/challenges
            if _is_entity(orphan_path) and _is_entity(neighbor_path):
                logger.info(" Skip entity-entity pair: %s%s", orphan_path.name, neighbor_path.name)
                skipped_entity_pair += 1
                continue
            # Same-source exclusion: N claims from one paper all "supporting" each other
            # inflates graph density without adding information
            if _same_source(orphan_path, neighbor_path):
                logger.info(" Skip same-source pair: %s%s", orphan_path.name, neighbor_path.name)
                skipped_same_source += 1
                continue
            neighbor_body = _get_body(neighbor_path)
            # Classify with Haiku
            result = classify_edge(orphan_title, orphan_body,
                                   neighbor_title, neighbor_body, api_key)
            edge_type = result["edge_type"]
            confidence = result["confidence"]
            reason = result["reason"]
            logger.info("%s (%.3f) %s [%.2f]: %s",
                        neighbor_title[:50], score, edge_type, confidence, reason[:60])
            # Collect everything first; files are only written under the lock
            # in Step 5 so a mid-run failure leaves the worktree untouched.
            edges_to_write.append({
                "neighbor_path": neighbor_path,
                "neighbor_rel": neighbor_rel,
                "neighbor_title": neighbor_title,
                "orphan_title": str(orphan_title),
                "orphan_rel": rel_path,
                "edge_type": edge_type,
                "score": score,
                "confidence": confidence,
                "reason": reason,
            })
        # Rate limit courtesy
        if not args.dry_run and i < len(batch) - 1:
            time.sleep(0.3)
    logger.info("\n=== Summary ===")
    logger.info("Orphans processed: %d", len(batch))
    logger.info("Edges to write: %d", len(edges_to_write))
    logger.info("Skipped (no vector): %d", skipped_no_vector)
    logger.info("Skipped (no neighbors): %d", skipped_no_neighbors)
    logger.info("Skipped (entity-entity): %d", skipped_entity_pair)
    logger.info("Skipped (same-source): %d", skipped_same_source)
    if not edges_to_write:
        logger.info("Nothing to write.")
        return
    if args.dry_run:
        logger.info("\n=== Dry Run — Edges That Would Be Written ===")
        for e in edges_to_write:
            logger.info(" %s → [%s] → %s (score=%.3f, conf=%.2f)",
                        e["neighbor_title"][:40], e["edge_type"],
                        e["orphan_title"][:40], e["score"], e["confidence"])
        return
    # Step 5: Acquire lock, create branch, write edges, commit, push, create PR
    lock_path = REPO_DIR.parent / ".main-worktree.lock"
    if not acquire_lock(lock_path):
        logger.error("Cannot acquire worktree lock — aborting")
        sys.exit(1)
    try:
        # Create branch
        if not create_branch(REPO_DIR, branch_name):
            logger.error("Failed to create branch %s", branch_name)
            sys.exit(1)
        # Write edges
        modified_files = set()
        written = 0
        summary_lines = []
        for e in edges_to_write:
            ok = write_edge(
                e["neighbor_path"], e["orphan_title"], e["edge_type"],
                date_str, dry_run=False,
            )
            if ok:
                modified_files.add(e["neighbor_path"])
                written += 1
                summary_lines.append(
                    f"`{e['neighbor_title'][:50]}` → [{e['edge_type']}] → "
                    f"`{e['orphan_title'][:50]}` (score={e['score']:.3f})"
                )
        logger.info("Wrote %d edges across %d files", written, len(modified_files))
        if not modified_files:
            # Every candidate was a duplicate/capped — drop the empty branch.
            logger.info("No edges written — cleaning up branch")
            subprocess.run(["git", "checkout", "main"], cwd=str(REPO_DIR),
                           capture_output=True)
            subprocess.run(["git", "branch", "-d", branch_name], cwd=str(REPO_DIR),
                           capture_output=True)
            return
        # Commit and push
        orphan_count = len(set(e["orphan_title"] for e in edges_to_write if e["neighbor_path"] in modified_files))
        if commit_and_push(REPO_DIR, branch_name, list(modified_files), orphan_count):
            logger.info("Pushed branch %s", branch_name)
            # Create PR
            pr_url = create_pr(branch_name, orphan_count, summary_lines)
            if pr_url:
                logger.info("PR created: %s", pr_url)
            else:
                logger.warning("PR creation failed — branch is pushed, create manually")
        else:
            logger.error("Commit/push failed")
    finally:
        # Always return to main — even on exception (Ganymede: branch cleanup)
        try:
            subprocess.run(["git", "checkout", "main"], cwd=str(REPO_DIR),
                           capture_output=True)
        except Exception:
            pass
        release_lock(lock_path)
    logger.info("Done.")
if __name__ == "__main__":  # script entry point
    main()