Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source of truth. Previously only 8 of 67 files existed in repo — the rest were deployed directly to VPS via SCP, causing massive drift. Includes: - pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.) - pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh - diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics) - agent-state/: bootstrap, lib-state, cascade inbox processor, schema - systemd/: service unit files for reference - deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate - research-session.sh: updated with Step 8.5 digest + cascade inbox processing No new code written — all files are exact copies from VPS as of 2026-04-06. From this point forward: edit in repo, commit, then deploy.sh. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
200 lines
6.5 KiB
Python
"""Atomic extract-and-connect — wire new claims to the KB at extraction time.
|
|
|
|
After extraction writes claim files to disk, this module:
|
|
1. Embeds each new claim (title + description + body snippet)
|
|
2. Searches Qdrant for semantically similar existing claims
|
|
3. Adds found neighbors as `related` edges on the NEW claim's frontmatter
|
|
|
|
Key design decision: edges are written on the NEW claim, not on existing claims.
|
|
Writing on existing claims would cause merge conflicts (same reason entities are
|
|
queued, not written on branches). When the PR merges, embed-on-merge adds the
|
|
new claim to Qdrant, and reweave can later add reciprocal edges on neighbors.
|
|
|
|
Cost: ~$0.0001 per claim (embedding only). No LLM classification — defaults to
|
|
"related". Reweave handles supports/challenges classification in a separate pass.
|
|
|
|
Owner: Epimetheus
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger("pipeline.connect")
|
|
|
|
# Similarity threshold for auto-connecting — below reweave's 0.70 but above
# the noise floor (~0.55). "related" still means actually related, not vaguely topical.
CONNECT_THRESHOLD = 0.65
# Cap on edges added per new claim; also used as the Qdrant search limit.
CONNECT_MAX_NEIGHBORS = 5
|
|
|
|
# --- Import search functions ---
# This module is called from openrouter-extract-v2.py which may not have lib/ on path
# via the package, so handle both import paths.
try:
    # Normal case: imported as a submodule of the lib/ package.
    from .search import embed_query, search_qdrant
    from .post_extract import parse_frontmatter, _rebuild_content
except ImportError:
    # Fallback: loaded as a loose script — put this file's directory on sys.path
    # so sibling modules resolve as top-level imports.
    sys.path.insert(0, os.path.dirname(__file__))
    from search import embed_query, search_qdrant
    from post_extract import parse_frontmatter, _rebuild_content
|
|
|
|
|
|
def _build_search_text(content: str) -> str:
    """Extract title + description + first 500 chars of body for embedding."""
    fm, body = parse_frontmatter(content)
    pieces: list[str] = []

    # Frontmatter description first, with any wrapping quotes removed.
    if fm:
        description = fm.get("description", "")
        if isinstance(description, str) and description:
            pieces.append(description.strip('"').strip("'"))

    # H1 title pulled out of the markdown body, if present.
    heading = re.search(r"^# (.+)$", body, re.MULTILINE) if body else None
    if heading:
        pieces.append(heading.group(1).strip())

    # Body snippet: drop the H1 line, cut at the first "---" divider
    # (stops before "Relevant Notes"/"Topics" sections), cap at 500 chars.
    if body:
        remainder = re.sub(r"^# .+\n*", "", body).strip()
        remainder = re.split(r"\n---\n", remainder)[0].strip()
        if remainder:
            pieces.append(remainder[:500])

    return " ".join(pieces)
|
|
|
|
|
|
def _add_related_edges(claim_path: str, neighbor_titles: list[str]) -> bool:
    """Add related edges to a claim's frontmatter. Returns True if modified."""
    try:
        with open(claim_path) as fh:
            raw = fh.read()
    except Exception as exc:
        logger.warning("Cannot read %s: %s", claim_path, exc)
        return False

    fm, body = parse_frontmatter(raw)
    if fm is None:
        # No parseable frontmatter — nothing to attach edges to.
        return False

    # Normalize whatever `related` currently holds into a list.
    current = fm.get("related", [])
    if isinstance(current, str):
        current = [current]
    elif not isinstance(current, list):
        current = []

    # Case-insensitive dedup against both existing edges and within the new batch.
    seen = {str(item).strip().lower() for item in current}
    fresh: list[str] = []
    for title in neighbor_titles:
        key = title.strip().lower()
        if key not in seen:
            fresh.append(title)
            seen.add(key)

    if not fresh:
        return False

    fm["related"] = current + fresh

    # Rebuild the file with the updated frontmatter and write it back in place.
    with open(claim_path, "w") as fh:
        fh.write(_rebuild_content(fm, body))

    return True
|
|
|
|
|
|
def connect_new_claims(
    claim_paths: list[str],
    threshold: float = CONNECT_THRESHOLD,
    max_neighbors: int = CONNECT_MAX_NEIGHBORS,
) -> dict:
    """Connect newly-written claims to the existing KB via vector search.

    Args:
        claim_paths: List of file paths to newly-written claim files.
        threshold: Minimum cosine similarity for connection.
        max_neighbors: Maximum edges to add per claim.

    Returns:
        {
            "total": int,
            "connected": int,
            "edges_added": int,
            "skipped_embed_failed": int,
            "skipped_no_neighbors": int,
            "connections": [{"claim": str, "neighbors": [str]}],
        }
    """
    stats = {
        "total": len(claim_paths),
        "connected": 0,
        "edges_added": 0,
        "skipped_embed_failed": 0,
        "skipped_no_neighbors": 0,
        "connections": [],
    }

    for path in claim_paths:
        try:
            with open(path) as fh:
                text = fh.read()
        except Exception:
            # Unreadable file — skip silently (not tracked in any skip bucket).
            continue

        # Too little searchable text to embed meaningfully.
        query = _build_search_text(text)
        if not query or len(query) < 20:
            stats["skipped_no_neighbors"] += 1
            continue

        vector = embed_query(query)
        if vector is None:
            stats["skipped_embed_failed"] += 1
            continue

        # Search Qdrant for neighbors (exclude nothing — new claim isn't in
        # Qdrant yet). domain=None: cross-domain connections are valuable.
        hits = search_qdrant(
            vector,
            limit=max_neighbors,
            domain=None,
            score_threshold=threshold,
        )

        # Pull non-empty titles out of the hit payloads; an empty hit list
        # falls through to the same "no neighbors" bucket.
        titles = [
            title
            for title in (
                hit.get("payload", {}).get("claim_title", "") for hit in (hits or [])
            )
            if title
        ]
        if not titles:
            stats["skipped_no_neighbors"] += 1
            continue

        # Write the edges onto the NEW claim's frontmatter only.
        if not _add_related_edges(path, titles):
            stats["skipped_no_neighbors"] += 1
            continue

        stats["connected"] += 1
        stats["edges_added"] += len(titles)
        stats["connections"].append({
            "claim": os.path.basename(path),
            "neighbors": titles,
        })
        logger.info("Connected %s → %d neighbors", os.path.basename(path), len(titles))

    logger.info(
        "Extract-and-connect: %d/%d claims connected (%d edges added, %d embed failed, %d no neighbors)",
        stats["connected"], stats["total"], stats["edges_added"],
        stats["skipped_embed_failed"], stats["skipped_no_neighbors"],
    )

    return stats
|