The regex fallback was writing list entries as ' - "title"' (2-space indent + quotes) while existing frontmatter uses '- title' (0-space indent, no quotes). This caused YAML parse failures during merge. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
972 lines
36 KiB
Python
972 lines
36 KiB
Python
#!/usr/bin/env python3
|
|
"""Orphan Reweave — connect isolated claims via vector similarity + Haiku classification.
|
|
|
|
Finds claims with zero incoming links (orphans), uses Qdrant to find semantically
|
|
similar neighbors, classifies the relationship with Haiku, and writes edges on the
|
|
neighbor's frontmatter pointing TO the orphan.
|
|
|
|
Usage:
|
|
python3 reweave.py --dry-run # Show what would be connected
|
|
python3 reweave.py --max-orphans 50 # Process up to 50 orphans
|
|
python3 reweave.py --threshold 0.72 # Override similarity floor
|
|
|
|
Design:
|
|
- Orphan = zero incoming links (no other claim's supports/challenges/related/depends_on points to it)
|
|
- Write edge on NEIGHBOR (not orphan) so orphan gains an incoming link
|
|
- Haiku classifies: supports | challenges | related (>=0.85 confidence for supports/challenges)
|
|
- reweave_edges parallel field for tooling-readable provenance
|
|
- Single PR per run for Leo review
|
|
|
|
Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>
|
|
"""
|
|
|
|
import argparse
|
|
import datetime
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
# Root logger for the whole script; INFO is the normal operating verbosity.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("reweave")

# --- Config ---
# All paths/endpoints can be overridden via environment variables for testing.
REPO_DIR = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main"))
SECRETS_DIR = Path(os.environ.get("SECRETS_DIR", "/opt/teleo-eval/secrets"))
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "teleo-claims")
FORGEJO_URL = os.environ.get("FORGEJO_URL", "http://localhost:3000")

# Top-level KB directories scanned for knowledge files.
EMBED_DIRS = ["domains", "core", "foundations", "decisions", "entities"]
# Frontmatter fields that carry outgoing graph edges.
EDGE_FIELDS = ("supports", "challenges", "challenged_by", "depends_on", "related")
# Matches [[wiki-style]] links in claim bodies.
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")

# Thresholds (from calibration data — Mar 28)
DEFAULT_THRESHOLD = 0.70  # Elbow in score distribution
DEFAULT_MAX_ORPHANS = 50  # Keep PRs reviewable
DEFAULT_MAX_NEIGHBORS = 3  # Don't over-connect
HAIKU_CONFIDENCE_FLOOR = 0.85  # Below this → default to "related"
PER_FILE_EDGE_CAP = 10  # Max total reweave edges per neighbor file

# Domain processing order: diversity first, internet-finance last (Leo)
DOMAIN_PRIORITY = [
    "ai-alignment", "health", "space-development", "entertainment",
    "creative-industries", "collective-intelligence", "governance",
    # internet-finance last — batch-imported futarchy cluster, lower cross-domain value
    "internet-finance",
]
|
|
|
|
|
|
# ─── Orphan Detection ────────────────────────────────────────────────────────
|
|
|
|
|
|
def _parse_frontmatter(path: Path) -> dict | None:
    """Parse YAML frontmatter from a markdown file. Returns dict or None."""
    try:
        raw = path.read_text(errors="replace")
    except Exception:
        return None
    # Frontmatter must open the file with "---" and close with a "---" line.
    if not raw.startswith("---"):
        return None
    close = raw.find("\n---", 3)
    if close == -1:
        return None
    try:
        parsed = yaml.safe_load(raw[3:close])
    except Exception:
        return None
    # A valid frontmatter is a mapping; scalars/lists are treated as absent.
    return parsed if isinstance(parsed, dict) else None
|
|
|
|
|
|
def _get_body(path: Path) -> str:
    """Get body text (after frontmatter) from a markdown file."""
    try:
        content = path.read_text(errors="replace")
    except Exception:
        return ""
    # No opening "---" means no frontmatter — the whole file is body.
    if not content.startswith("---"):
        return content
    closing = content.find("\n---", 3)
    # Unterminated frontmatter: fall back to returning the whole file.
    return content if closing == -1 else content[closing + 4:].strip()
|
|
|
|
|
|
def _get_edge_targets(path: Path) -> list[str]:
    """Extract all outgoing edge targets from a claim's frontmatter + wiki links."""
    found: list[str] = []
    fm = _parse_frontmatter(path)
    if fm:
        # Canonical edge fields may hold a list or a single scalar string.
        for field in EDGE_FIELDS:
            value = fm.get(field)
            if isinstance(value, list):
                for item in value:
                    if item:
                        found.append(str(item).strip().lower())
            elif isinstance(value, str) and value.strip():
                found.append(value.strip().lower())
        # reweave_edges (written by previous runs) is list-only.
        previous = fm.get("reweave_edges")
        if isinstance(previous, list):
            for item in previous:
                if item:
                    found.append(str(item).strip().lower())

    # Wiki links in the body also count as outgoing edges.
    try:
        raw = path.read_text(errors="replace")
    except Exception:
        pass
    else:
        fm_end = raw.find("\n---", 3)
        if fm_end > 0:
            for link in WIKI_LINK_RE.findall(raw[fm_end + 4:]):
                found.append(link.strip().lower())

    return found
|
|
|
|
|
|
def _claim_name_variants(path: Path, repo_root: Path = None) -> list[str]:
    """Generate name variants for a claim file (used for incoming link matching).

    A claim at domains/ai-alignment/rlhf-reward-hacking.md could be referenced as:
    - "rlhf-reward-hacking"
    - "rlhf reward hacking"
    - "RLHF reward hacking" (title case)
    - The actual 'name' or 'title' from frontmatter
    - "domains/ai-alignment/rlhf-reward-hacking" (relative path without .md)
    """
    stem = path.stem
    names = {stem.lower(), stem.lower().replace("-", " ")}

    # Also match by relative path (Ganymede Q1: some edges use path references)
    if repo_root:
        try:
            rel = str(path.relative_to(repo_root)).removesuffix(".md")
        except ValueError:
            pass
        else:
            names.add(rel.lower())

    # Frontmatter-declared display names.
    fm = _parse_frontmatter(path)
    if fm:
        for key in ("name", "title"):
            candidate = fm.get(key)
            if isinstance(candidate, str) and candidate.strip():
                names.add(candidate.strip().lower())

    return list(names)
|
|
|
|
|
|
def _is_entity(path: Path) -> bool:
    """Check if a file is an entity (not a claim). Entities need different edge vocabulary."""
    fm = _parse_frontmatter(path)
    if fm and fm.get("type") == "entity":
        return True
    # Path-segment check (not a substring test) avoids false positives on
    # paths like "domains/entities-overview/". `path` is already a Path —
    # the previous Path(path) re-wrap was redundant.
    return "entities" in path.parts
|
|
|
|
|
|
def _same_source(path_a: Path, path_b: Path) -> bool:
    """Check if two claims derive from the same source material.

    Prevents self-referential edges where N claims about the same paper
    all "support" each other — inflates graph density without adding information.
    """
    fm_a = _parse_frontmatter(path_a)
    fm_b = _parse_frontmatter(path_b)
    if not (fm_a and fm_b):
        return False

    # Either "source" or the legacy "source_file" field may name the origin.
    origin_a = fm_a.get("source") or fm_a.get("source_file") or ""
    origin_b = fm_b.get("source") or fm_b.get("source_file") or ""
    return bool(
        origin_a and origin_b
        and str(origin_a).strip() == str(origin_b).strip()
    )
|
|
|
|
|
|
def find_all_claims(repo_root: Path) -> list[Path]:
    """Find all knowledge files (claim, framework, entity, decision) in the KB."""
    results: list[Path] = []
    for dirname in EMBED_DIRS:
        root = repo_root / dirname
        if not root.is_dir():
            continue
        for md_file in root.rglob("*.md"):
            # Skip underscore-prefixed files.
            if md_file.name.startswith("_"):
                continue
            fm = _parse_frontmatter(md_file)
            # Keep only typed knowledge files; raw sources, musings, and
            # untyped files are excluded.
            if fm and fm.get("type") not in ("source", "musing", None):
                results.append(md_file)
    return results
|
|
|
|
|
|
def build_reverse_link_index(claims: list[Path]) -> dict[str, set[Path]]:
    """Build a reverse index: claim_name_variant → set of files that link TO it.

    For each claim, extract all outgoing edges. For each target name, record
    the source claim as an incoming link for that target.
    """
    # name_variant → set of source paths that point to it
    incoming: dict[str, set[Path]] = {}

    for claim_path in claims:
        for target in _get_edge_targets(claim_path):
            # setdefault replaces the manual "if target not in incoming" dance.
            incoming.setdefault(target, set()).add(claim_path)

    return incoming
|
|
|
|
|
|
def find_orphans(claims: list[Path], incoming: dict[str, set[Path]],
                 repo_root: Path = None) -> list[Path]:
    """Find claims with zero incoming links."""
    isolated = []
    for claim_path in claims:
        # A claim counts as linked only if some *other* file targets one of
        # its name variants — self-links are discounted.
        for variant in _claim_name_variants(claim_path, repo_root):
            if incoming.get(variant, set()) - {claim_path}:
                break
        else:
            isolated.append(claim_path)
    return isolated
|
|
|
|
|
|
def sort_orphans_by_domain(orphans: list[Path], repo_root: Path) -> list[Path]:
    """Sort orphans by domain priority (diversity first, internet-finance last).

    Unknown domains sort after all known domains but strictly before the
    deliberately-last entry of DOMAIN_PRIORITY (previously they *tied* with
    it, contradicting the stated ordering).
    """
    last = len(DOMAIN_PRIORITY) - 1  # index of the deliberately-last domain

    def domain_key(path: Path) -> tuple[int, str]:
        rel = path.relative_to(repo_root)
        parts = rel.parts
        domain = ""
        if len(parts) >= 2 and parts[0] in ("domains", "entities", "decisions"):
            domain = parts[1]
        elif parts[0] == "foundations" and len(parts) >= 2:
            domain = parts[1]
        elif parts[0] == "core":
            domain = "core"

        try:
            priority = DOMAIN_PRIORITY.index(domain)
            if priority == last:
                # Bump the deprioritized final domain past the unknown slot
                # so unknown domains genuinely sort before it.
                priority = last + 1
        except ValueError:
            # Unknown domain goes before internet-finance but after known ones
            priority = last

        # Tie-break within a domain by file stem for deterministic batches.
        return (priority, path.stem)

    return sorted(orphans, key=domain_key)
|
|
|
|
|
|
# ─── Qdrant Search ───────────────────────────────────────────────────────────
|
|
|
|
|
|
def _get_api_key() -> str:
    """Load OpenRouter API key."""
    # Secrets file takes precedence over the environment variable.
    key_file = SECRETS_DIR / "openrouter-key"
    if key_file.exists():
        return key_file.read_text().strip()
    env_key = os.environ.get("OPENROUTER_API_KEY", "")
    if not env_key:
        # No key anywhere — the run cannot proceed.
        logger.error("No OpenRouter API key found")
        sys.exit(1)
    return env_key
|
|
|
|
|
|
def make_point_id(rel_path: str) -> str:
    """Deterministic point ID from repo-relative path (matches embed-claims.py)."""
    digest = hashlib.md5(rel_path.encode())
    return digest.hexdigest()
|
|
|
|
|
|
def get_vector_from_qdrant(rel_path: str) -> list[float] | None:
    """Retrieve a claim's existing vector from Qdrant by its point ID."""
    payload = json.dumps(
        {"ids": [make_point_id(rel_path)], "with_vector": True}
    ).encode()
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            points = json.loads(resp.read()).get("result", [])
        if points and points[0].get("vector"):
            return points[0]["vector"]
    except Exception as e:
        logger.warning("Qdrant point lookup failed for %s: %s", rel_path, e)
    # Not embedded yet, empty vector, or request failure.
    return None
|
|
|
|
|
|
def search_neighbors(vector: list[float], exclude_path: str,
                     threshold: float, limit: int) -> list[dict]:
    """Search Qdrant for nearest neighbors above threshold, excluding self."""
    query = {
        "vector": vector,
        "limit": limit + 5,  # over-fetch to account for self + filtered
        "with_payload": True,
        "score_threshold": threshold,
        "filter": {
            "must_not": [{"key": "claim_path", "match": {"value": exclude_path}}]
        },
    }
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search",
        data=json.dumps(query).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            response = json.loads(resp.read())
    except Exception as e:
        logger.warning("Qdrant search failed: %s", e)
        return []
    # Trim the over-fetch back down to the requested count.
    return response.get("result", [])[:limit]
|
|
|
|
|
|
# ─── Haiku Edge Classification ───────────────────────────────────────────────
|
|
|
|
|
|
CLASSIFY_PROMPT = """You are classifying the relationship between two knowledge claims.
|
|
|
|
CLAIM A (the orphan — needs to be connected):
|
|
Title: {orphan_title}
|
|
Body: {orphan_body}
|
|
|
|
CLAIM B (the neighbor — already connected in the knowledge graph):
|
|
Title: {neighbor_title}
|
|
Body: {neighbor_body}
|
|
|
|
What is the relationship FROM Claim B TO Claim A?
|
|
|
|
Options:
|
|
- "supports" — Claim B provides evidence, reasoning, or examples that strengthen Claim A
|
|
- "challenges" — Claim B contradicts, undermines, or provides counter-evidence to Claim A. NOTE: "challenges" is underused — if one claim says X works and another says X fails, or they propose incompatible mechanisms, that IS a challenge. Use it.
|
|
- "related" — Claims are topically connected but neither supports nor challenges the other. This is the WEAKEST edge — prefer supports/challenges when the relationship has directionality.
|
|
|
|
Respond with EXACTLY this JSON format, nothing else:
|
|
{{"edge_type": "supports|challenges|related", "confidence": 0.0-1.0, "reason": "one sentence explanation"}}
|
|
"""
|
|
|
|
|
|
def classify_edge(orphan_title: str, orphan_body: str,
                  neighbor_title: str, neighbor_body: str,
                  api_key: str) -> dict:
    """Use Haiku to classify the edge type between two claims.

    Returns {"edge_type": str, "confidence": float, "reason": str}.
    Falls back to "related" on any failure. Unexpected model labels
    (wrong case, values outside the vocabulary) are also normalized to
    "related" so an invalid edge field is never written to frontmatter.
    """
    default = {"edge_type": "related", "confidence": 0.5, "reason": "classification failed"}

    prompt = CLASSIFY_PROMPT.format(
        orphan_title=orphan_title,
        orphan_body=orphan_body[:500],  # truncate bodies to bound token cost
        neighbor_title=neighbor_title,
        neighbor_body=neighbor_body[:500],
    )

    payload = json.dumps({
        "model": "anthropic/claude-3.5-haiku",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200,
        "temperature": 0.3,
    }).encode()

    req = urllib.request.Request(
        "https://openrouter.ai/api/v1/chat/completions",
        data=payload,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read())
        content = data["choices"][0]["message"]["content"].strip()

        # Parse JSON from response (handle markdown code blocks)
        if content.startswith("```"):
            content = content.split("\n", 1)[-1].rsplit("```", 1)[0].strip()

        result = json.loads(content)

        # Fix: validate the label. The previous code trusted whatever string
        # the model returned; anything outside the vocabulary degrades to
        # "related" rather than becoming a bogus frontmatter field.
        edge_type = str(result.get("edge_type", "related")).strip().lower()
        if edge_type not in ("supports", "challenges", "related"):
            edge_type = "related"
        confidence = float(result.get("confidence", 0.5))

        # Enforce confidence floor for supports/challenges
        if edge_type in ("supports", "challenges") and confidence < HAIKU_CONFIDENCE_FLOOR:
            edge_type = "related"

        return {
            "edge_type": edge_type,
            "confidence": confidence,
            # Coerce to str — callers slice this (reason[:60]).
            "reason": str(result.get("reason", "")),
        }
    except Exception as e:
        logger.warning("Haiku classification failed: %s", e)
        return default
|
|
|
|
|
|
# ─── YAML Frontmatter Editing ────────────────────────────────────────────────
|
|
|
|
|
|
def _count_reweave_edges(path: Path) -> int:
    """Count existing reweave_edges in a file's frontmatter."""
    fm = _parse_frontmatter(path)
    rw = fm.get("reweave_edges") if fm else None
    # Anything other than a list (missing, scalar, malformed) counts as zero.
    return len(rw) if isinstance(rw, list) else 0
|
|
|
|
|
|
def write_edge(neighbor_path: Path, orphan_title: str, edge_type: str,
               date_str: str, dry_run: bool = False) -> bool:
    """Write a reweave edge on the neighbor's frontmatter.

    Adds to both the edge_type list (related/supports/challenges) and
    the parallel reweave_edges list for provenance tracking.

    Uses ruamel.yaml for round-trip YAML preservation; falls back to a
    regex editor only when ruamel.yaml is not installed. Returns True
    only when an edge was written (or would be, under dry_run).
    """
    # Check per-file cap
    if _count_reweave_edges(neighbor_path) >= PER_FILE_EDGE_CAP:
        logger.info(" Skip %s — per-file edge cap (%d) reached", neighbor_path.name, PER_FILE_EDGE_CAP)
        return False

    try:
        text = neighbor_path.read_text(errors="replace")
    except Exception as e:
        logger.warning(" Cannot read %s: %s", neighbor_path, e)
        return False

    if not text.startswith("---"):
        logger.warning(" No frontmatter in %s", neighbor_path.name)
        return False

    end = text.find("\n---", 3)
    if end == -1:
        return False

    fm_text = text[3:end]
    body_text = text[end:]  # includes the closing ---

    # Fix: import separately so only a *missing* ruamel triggers the regex
    # fallback. Previously the whole edit sat in one try/except ImportError,
    # so any YAML parse/dump error on a single malformed file crashed the
    # entire run uncaught.
    try:
        from ruamel.yaml import YAML
    except ImportError:
        logger.info(" ruamel.yaml not available, using regex fallback")
        return _write_edge_regex(neighbor_path, fm_text, body_text, orphan_title,
                                 edge_type, date_str, dry_run)

    import io

    try:
        ry = YAML()
        ry.preserve_quotes = True
        ry.width = 4096  # prevent line wrapping

        fm = ry.load(fm_text)
        if not isinstance(fm, dict):
            return False

        # Add to edge_type list (related/supports/challenges)
        # Clean value only — provenance tracked in reweave_edges (Ganymede: comment-in-string bug)
        if edge_type not in fm:
            fm[edge_type] = []
        elif not isinstance(fm[edge_type], list):
            fm[edge_type] = [fm[edge_type]]

        # Check for duplicate
        existing = [str(v).strip().lower() for v in fm[edge_type] if v]
        if orphan_title.strip().lower() in existing:
            logger.info(" Skip duplicate edge: %s → %s", neighbor_path.name, orphan_title)
            return False

        fm[edge_type].append(orphan_title)

        # Add to reweave_edges with provenance (edge_type + date for audit trail)
        if "reweave_edges" not in fm:
            fm["reweave_edges"] = []
        elif not isinstance(fm["reweave_edges"], list):
            fm["reweave_edges"] = [fm["reweave_edges"]]
        fm["reweave_edges"].append(f"{orphan_title}|{edge_type}|{date_str}")

        # Serialize back
        buf = io.StringIO()
        ry.dump(fm, buf)
        new_fm = buf.getvalue().rstrip("\n")

        new_text = f"---\n{new_fm}{body_text}"

        if not dry_run:
            neighbor_path.write_text(new_text)
        return True

    except Exception as e:
        # A malformed file should skip that file, not abort the batch.
        logger.warning(" YAML round-trip edit failed for %s: %s", neighbor_path.name, e)
        return False
|
|
|
|
|
|
def _write_edge_regex(neighbor_path: Path, fm_text: str, body_text: str,
                      orphan_title: str, edge_type: str, date_str: str,
                      dry_run: bool) -> bool:
    """Fallback: add edge via regex when ruamel.yaml is unavailable.

    Appends `- {orphan_title}` to the edge_type list and a provenance entry
    to reweave_edges. Returns False for duplicates or unsupported inline
    (`field: [...]`) list syntax on the edge field.
    """
    # Strip leading newline from fm_text (text[3:end] includes \n after ---)
    fm_text = fm_text.lstrip("\n")

    # Check for duplicate before writing
    existing_re = re.compile(
        rf'^\s*-\s*["\']?{re.escape(orphan_title)}["\']?\s*$',
        re.MULTILINE | re.IGNORECASE,
    )
    if existing_re.search(fm_text):
        logger.info(" Skip duplicate edge (regex): %s → %s", neighbor_path.name, orphan_title)
        return False

    # Fix: a list item at ANY indent continues the list. The previous check
    # line.startswith(("- ", " -")) missed entries indented by two or more
    # spaces, so the new entry was inserted mid-list, breaking the YAML.
    item_re = re.compile(r"\s*-\s")

    def _append_to_list(fm: str, field: str, entry: str) -> str | None:
        """Append `entry` to block-list `field`; None if the field is an inline list."""
        if re.search(rf"^{field}:\s*\[", fm, re.MULTILINE):
            return None  # inline list — too fragile for regex editing
        if not re.search(rf"^{field}:\s*$", fm, re.MULTILINE):
            # Field doesn't exist — add at end of frontmatter
            return fm.rstrip("\n") + f"\n{field}:\n{entry}"
        out_lines: list[str] = []
        in_field = False
        inserted = False
        for line in fm.split("\n"):
            if in_field and not item_re.match(line):
                # First non-item line ends the list — insert just before it.
                out_lines.append(entry)
                in_field = False
                inserted = True
            out_lines.append(line)
            if re.match(rf"^{field}:\s*$", line):
                in_field = True
        if in_field and not inserted:
            out_lines.append(entry)  # field was last in frontmatter
        return "\n".join(out_lines)

    entry_line = f'- {orphan_title}'
    rw_line = f'- {orphan_title}|{edge_type}|{date_str}'

    updated = _append_to_list(fm_text, edge_type, entry_line)
    if updated is None:
        logger.warning(" Inline list format for %s in %s, skipping", edge_type, neighbor_path.name)
        return False
    fm_text = updated

    updated = _append_to_list(fm_text, "reweave_edges", rw_line)
    if updated is None:
        # Edge was added but provenance can't be: previously this case lost
        # the reweave_edges entry silently — at least log it now.
        logger.warning(" Inline reweave_edges in %s — provenance entry not recorded", neighbor_path.name)
    else:
        fm_text = updated

    new_text = f"---\n{fm_text}{body_text}"

    if not dry_run:
        neighbor_path.write_text(new_text)
    return True
|
|
|
|
|
|
# ─── Git + PR ────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def create_branch(repo_root: Path, branch_name: str) -> bool:
    """Create and checkout a new branch. Cleans up stale local/remote branches from prior failed runs."""
    cwd = str(repo_root)

    # A stale local branch from an earlier failed run today — delete quietly,
    # ignoring the error when it doesn't exist.
    subprocess.run(["git", "branch", "-D", branch_name],
                   cwd=cwd, capture_output=True)

    # Same for a stale remote branch (needs push credentials).
    token_file = SECRETS_DIR / "forgejo-admin-token"
    if token_file.exists():
        token = token_file.read_text().strip()
        push_url = f"http://teleo:{token}@localhost:3000/teleo/teleo-codex.git"
        subprocess.run(["git", "push", push_url, "--delete", branch_name],
                       cwd=cwd, capture_output=True)

    try:
        subprocess.run(["git", "checkout", "-b", branch_name],
                       cwd=cwd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        logger.error("Failed to create branch %s: %s", branch_name, e.stderr.decode())
        return False
    return True
|
|
|
|
|
|
def commit_and_push(repo_root: Path, branch_name: str, modified_files: list[Path],
                    orphan_count: int) -> bool:
    """Stage modified files, commit, and push.

    Returns True on success. Fix: git failures (add/commit/push with
    check=True) are now caught, logged, and reported as False instead of
    raising CalledProcessError and crashing the run mid-batch.
    """
    cwd = str(repo_root)
    try:
        # Stage only modified files
        for f in modified_files:
            subprocess.run(["git", "add", str(f)], cwd=cwd,
                           check=True, capture_output=True)

        # Check if anything staged
        result = subprocess.run(["git", "diff", "--cached", "--name-only"],
                                cwd=cwd, capture_output=True, text=True)
        if not result.stdout.strip():
            logger.info("No files staged — nothing to commit")
            return False

        msg = (
            f"reweave: connect {orphan_count} orphan claims via vector similarity\n\n"
            f"Threshold: {DEFAULT_THRESHOLD}, Haiku classification, {len(modified_files)} files modified.\n\n"
            f"Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>"
        )
        subprocess.run(["git", "commit", "-m", msg], cwd=cwd,
                       check=True, capture_output=True)

        # Push — inject token
        token_file = SECRETS_DIR / "forgejo-admin-token"
        if not token_file.exists():
            logger.error("No Forgejo token found at %s", token_file)
            return False
        token = token_file.read_text().strip()
        push_url = f"http://teleo:{token}@localhost:3000/teleo/teleo-codex.git"

        subprocess.run(["git", "push", "-u", push_url, branch_name],
                       cwd=cwd, check=True, capture_output=True)
        return True
    except subprocess.CalledProcessError as e:
        # Log stderr only — e.cmd may contain the token-bearing push URL.
        stderr = e.stderr.decode(errors="replace") if e.stderr else ""
        logger.error("Git operation failed (exit %s): %s", e.returncode, stderr)
        return False
|
|
|
|
|
|
def create_pr(branch_name: str, orphan_count: int, summary_lines: list[str]) -> str | None:
    """Create a Forgejo PR for the reweave batch."""
    token_file = SECRETS_DIR / "forgejo-admin-token"
    if not token_file.exists():
        return None
    token = token_file.read_text().strip()

    # Cap the edge list at 30 entries to keep the PR body readable.
    summary = "\n".join(f"- {line}" for line in summary_lines[:30])
    body = (
        f"## Orphan Reweave\n\n"
        f"Connected **{orphan_count}** orphan claims to the knowledge graph "
        f"via vector similarity (threshold {DEFAULT_THRESHOLD}) + Haiku edge classification.\n\n"
        f"### Edges Added\n{summary}\n\n"
        f"### Review Guide\n"
        f"- Each edge has a `# reweave:YYYY-MM-DD` comment — strip after review\n"
        f"- `reweave_edges` field tracks automated edges for tooling (graph_expand weights them 0.75x)\n"
        f"- Upgrade `related` → `supports`/`challenges` where you have better judgment\n"
        f"- Delete any edges that don't make sense\n\n"
        f"Pentagon-Agent: Epimetheus"
    )

    pr_fields = {
        "title": f"reweave: connect {orphan_count} orphan claims",
        "body": body,
        "head": branch_name,
        "base": "main",
    }
    request = urllib.request.Request(
        f"{FORGEJO_URL}/api/v1/repos/teleo/teleo-codex/pulls",
        data=json.dumps(pr_fields).encode(),
        headers={
            "Authorization": f"token {token}",
            "Content-Type": "application/json",
        },
    )

    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            response = json.loads(resp.read())
    except Exception as e:
        logger.error("PR creation failed: %s", e)
        return None
    return response.get("html_url", "")
|
|
|
|
|
|
# ─── Worktree Lock ───────────────────────────────────────────────────────────
|
|
|
|
# Holds the open lock-file handle between acquire_lock() and release_lock().
_lock_fd = None  # Module-level to prevent GC and avoid function-attribute fragility
|
|
|
|
|
|
def acquire_lock(lock_path: Path, timeout: int = 30) -> bool:
    """Acquire file lock for worktree access. Returns True if acquired.

    Non-blocking: a held lock fails immediately (the `timeout` parameter is
    currently unused; kept for interface compatibility). The open handle is
    kept in module-level _lock_fd until release_lock().
    """
    global _lock_fd
    import fcntl
    handle = None
    try:
        lock_path.parent.mkdir(parents=True, exist_ok=True)
        handle = open(lock_path, "w")
        fcntl.flock(handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
        handle.write(f"reweave:{os.getpid()}\n")
        handle.flush()
        _lock_fd = handle
        return True
    except (IOError, OSError):
        # Fix: close the half-opened handle — previously only _lock_fd was
        # nulled, leaking a file descriptor on every failed attempt.
        if handle is not None:
            try:
                handle.close()
            except Exception:
                pass
        logger.warning("Could not acquire worktree lock at %s — another process has it", lock_path)
        _lock_fd = None
        return False
|
|
|
|
|
|
def release_lock(lock_path: Path):
    """Release worktree lock."""
    global _lock_fd
    import fcntl
    # Detach the module-level handle first so a re-entrant call is a no-op.
    handle, _lock_fd = _lock_fd, None
    if handle:
        try:
            fcntl.flock(handle, fcntl.LOCK_UN)
            handle.close()
        except Exception:
            pass
    try:
        lock_path.unlink(missing_ok=True)
    except Exception:
        pass
|
|
|
|
|
|
# ─── Main ────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def main():
    """Entry point: find orphans, classify edges, write them, and open a PR.

    Pipeline: scan KB → reverse-link index → orphan list (domain-sorted) →
    per-orphan Qdrant neighbor search + Haiku classification → (unless
    --dry-run) branch, edge writes, commit/push, and a Forgejo PR.
    """
    global REPO_DIR, DEFAULT_THRESHOLD

    parser = argparse.ArgumentParser(description="Orphan Reweave — connect isolated claims")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be connected without modifying files")
    parser.add_argument("--max-orphans", type=int, default=DEFAULT_MAX_ORPHANS,
                        help=f"Max orphans to process (default {DEFAULT_MAX_ORPHANS})")
    parser.add_argument("--max-neighbors", type=int, default=DEFAULT_MAX_NEIGHBORS,
                        help=f"Max neighbors per orphan (default {DEFAULT_MAX_NEIGHBORS})")
    parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD,
                        help=f"Minimum cosine similarity (default {DEFAULT_THRESHOLD})")
    parser.add_argument("--repo-dir", type=str, default=None,
                        help="Override repo directory")
    args = parser.parse_args()

    if args.repo_dir:
        REPO_DIR = Path(args.repo_dir)
    # Propagate the CLI threshold to the global so commit/PR text reports it.
    DEFAULT_THRESHOLD = args.threshold

    date_str = datetime.date.today().isoformat()
    branch_name = f"reweave/{date_str}"

    logger.info("=== Orphan Reweave ===")
    logger.info("Repo: %s", REPO_DIR)
    logger.info("Threshold: %.2f, Max orphans: %d, Max neighbors: %d",
                args.threshold, args.max_orphans, args.max_neighbors)
    if args.dry_run:
        logger.info("DRY RUN — no files will be modified")

    # Step 1: Find all claims and build reverse-link index
    logger.info("Step 1: Scanning KB for claims...")
    claims = find_all_claims(REPO_DIR)
    logger.info(" Found %d knowledge files", len(claims))

    logger.info("Step 2: Building reverse-link index...")
    incoming = build_reverse_link_index(claims)

    logger.info("Step 3: Finding orphans...")
    orphans = find_orphans(claims, incoming, REPO_DIR)
    orphans = sort_orphans_by_domain(orphans, REPO_DIR)
    # max() guards against division by zero on an empty KB.
    logger.info(" Found %d orphans (%.1f%% of %d claims)",
                len(orphans), 100 * len(orphans) / max(len(claims), 1), len(claims))

    if not orphans:
        logger.info("No orphans found — KB is fully connected!")
        return

    # Cap to max_orphans
    batch = orphans[:args.max_orphans]
    logger.info(" Processing batch of %d orphans", len(batch))

    # Step 4: For each orphan, find neighbors and classify edges
    api_key = _get_api_key()
    edges_to_write: list[dict] = []  # {neighbor_path, orphan_title, edge_type, reason, score}
    skipped_no_vector = 0
    skipped_no_neighbors = 0
    skipped_entity_pair = 0
    skipped_same_source = 0

    for i, orphan_path in enumerate(batch):
        rel_path = str(orphan_path.relative_to(REPO_DIR))
        fm = _parse_frontmatter(orphan_path)
        # Prefer frontmatter name/title; fall back to the humanized file stem.
        orphan_title = fm.get("name", fm.get("title", orphan_path.stem.replace("-", " "))) if fm else orphan_path.stem
        orphan_body = _get_body(orphan_path)

        logger.info("[%d/%d] %s", i + 1, len(batch), orphan_title[:80])

        # Get vector from Qdrant
        vector = get_vector_from_qdrant(rel_path)
        if not vector:
            logger.info(" No vector in Qdrant — skipping (not embedded yet)")
            skipped_no_vector += 1
            continue

        # Find neighbors
        hits = search_neighbors(vector, rel_path, args.threshold, args.max_neighbors)
        if not hits:
            logger.info(" No neighbors above threshold %.2f", args.threshold)
            skipped_no_neighbors += 1
            continue

        for hit in hits:
            payload = hit.get("payload", {})
            neighbor_rel = payload.get("claim_path", "")
            neighbor_title = payload.get("claim_title", "")
            score = hit.get("score", 0)

            if not neighbor_rel:
                continue

            neighbor_path = REPO_DIR / neighbor_rel
            # Qdrant can lag behind the worktree — neighbor may be deleted.
            if not neighbor_path.exists():
                logger.info(" Neighbor %s not found on disk — skipping", neighbor_rel)
                continue

            # Entity-to-entity exclusion: entities need different vocabulary
            # (founded_by, competes_with, etc.) not supports/challenges
            if _is_entity(orphan_path) and _is_entity(neighbor_path):
                logger.info(" Skip entity-entity pair: %s ↔ %s", orphan_path.name, neighbor_path.name)
                skipped_entity_pair += 1
                continue

            # Same-source exclusion: N claims from one paper all "supporting" each other
            # inflates graph density without adding information
            if _same_source(orphan_path, neighbor_path):
                logger.info(" Skip same-source pair: %s ↔ %s", orphan_path.name, neighbor_path.name)
                skipped_same_source += 1
                continue

            neighbor_body = _get_body(neighbor_path)

            # Classify with Haiku
            result = classify_edge(orphan_title, orphan_body,
                                   neighbor_title, neighbor_body, api_key)
            edge_type = result["edge_type"]
            confidence = result["confidence"]
            reason = result["reason"]

            logger.info(" → %s (%.3f) %s [%.2f]: %s",
                        neighbor_title[:50], score, edge_type, confidence, reason[:60])

            # Collect now; actual file writes happen after the scan (Step 5),
            # so a dry run can report everything without touching files.
            edges_to_write.append({
                "neighbor_path": neighbor_path,
                "neighbor_rel": neighbor_rel,
                "neighbor_title": neighbor_title,
                "orphan_title": str(orphan_title),
                "orphan_rel": rel_path,
                "edge_type": edge_type,
                "score": score,
                "confidence": confidence,
                "reason": reason,
            })

        # Rate limit courtesy
        if not args.dry_run and i < len(batch) - 1:
            time.sleep(0.3)

    logger.info("\n=== Summary ===")
    logger.info("Orphans processed: %d", len(batch))
    logger.info("Edges to write: %d", len(edges_to_write))
    logger.info("Skipped (no vector): %d", skipped_no_vector)
    logger.info("Skipped (no neighbors): %d", skipped_no_neighbors)
    logger.info("Skipped (entity-entity): %d", skipped_entity_pair)
    logger.info("Skipped (same-source): %d", skipped_same_source)

    if not edges_to_write:
        logger.info("Nothing to write.")
        return

    if args.dry_run:
        logger.info("\n=== Dry Run — Edges That Would Be Written ===")
        for e in edges_to_write:
            logger.info(" %s → [%s] → %s (score=%.3f, conf=%.2f)",
                        e["neighbor_title"][:40], e["edge_type"],
                        e["orphan_title"][:40], e["score"], e["confidence"])
        return

    # Step 5: Acquire lock, create branch, write edges, commit, push, create PR
    lock_path = REPO_DIR.parent / ".main-worktree.lock"
    if not acquire_lock(lock_path):
        logger.error("Cannot acquire worktree lock — aborting")
        sys.exit(1)

    try:
        # Create branch
        if not create_branch(REPO_DIR, branch_name):
            logger.error("Failed to create branch %s", branch_name)
            sys.exit(1)

        # Write edges
        modified_files = set()
        written = 0
        summary_lines = []

        for e in edges_to_write:
            ok = write_edge(
                e["neighbor_path"], e["orphan_title"], e["edge_type"],
                date_str, dry_run=False,
            )
            if ok:
                modified_files.add(e["neighbor_path"])
                written += 1
                summary_lines.append(
                    f"`{e['neighbor_title'][:50]}` → [{e['edge_type']}] → "
                    f"`{e['orphan_title'][:50]}` (score={e['score']:.3f})"
                )

        logger.info("Wrote %d edges across %d files", written, len(modified_files))

        if not modified_files:
            # Every write was skipped (caps/duplicates) — drop the empty branch.
            logger.info("No edges written — cleaning up branch")
            subprocess.run(["git", "checkout", "main"], cwd=str(REPO_DIR),
                           capture_output=True)
            subprocess.run(["git", "branch", "-d", branch_name], cwd=str(REPO_DIR),
                           capture_output=True)
            return

        # Commit and push
        # Count only orphans whose edge actually landed in a modified file.
        orphan_count = len(set(e["orphan_title"] for e in edges_to_write if e["neighbor_path"] in modified_files))
        if commit_and_push(REPO_DIR, branch_name, list(modified_files), orphan_count):
            logger.info("Pushed branch %s", branch_name)

            # Create PR
            pr_url = create_pr(branch_name, orphan_count, summary_lines)
            if pr_url:
                logger.info("PR created: %s", pr_url)
            else:
                logger.warning("PR creation failed — branch is pushed, create manually")
        else:
            logger.error("Commit/push failed")

    finally:
        # Always return to main — even on exception (Ganymede: branch cleanup)
        try:
            subprocess.run(["git", "checkout", "main"], cwd=str(REPO_DIR),
                           capture_output=True)
        except Exception:
            pass
        release_lock(lock_path)

    logger.info("Done.")
|
|
|
|
|
|
# Script entry point — no side effects on import.
if __name__ == "__main__":
    main()
|