Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source of truth. Previously only 8 of 67 files existed in repo — the rest were deployed directly to VPS via SCP, causing massive drift. Includes: - pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.) - pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh - diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics) - agent-state/: bootstrap, lib-state, cascade inbox processor, schema - systemd/: service unit files for reference - deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate - research-session.sh: updated with Step 8.5 digest + cascade inbox processing No new code written — all files are exact copies from VPS as of 2026-04-06. From this point forward: edit in repo, commit, then deploy.sh. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
358 lines
13 KiB
Python
358 lines
13 KiB
Python
"""Entity batch processor — applies queued entity operations to main.
|
|
|
|
Reads from entity_queue, applies creates/updates to the main worktree,
|
|
commits directly to main. No PR needed for entity timeline appends —
|
|
they're factual, commutative, and low-risk.
|
|
|
|
Entity creates (new entity files) go through PR review like claims.
|
|
Entity updates (timeline appends) commit directly — they're additive
|
|
and recoverable from source archives if wrong.
|
|
|
|
Runs as part of the pipeline's ingest stage or as a standalone cron.
|
|
|
|
Epimetheus owns this module. Leo reviews changes. Rhea deploys.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
from datetime import date
|
|
from pathlib import Path
|
|
|
|
from . import config, db
|
|
from .entity_queue import cleanup, dequeue, mark_failed, mark_processed
|
|
|
|
logger = logging.getLogger("pipeline.entity_batch")
|
|
|
|
|
|
def _read_file(path: str) -> str:
|
|
try:
|
|
with open(path) as f:
|
|
return f.read()
|
|
except FileNotFoundError:
|
|
return ""
|
|
|
|
|
|
async def _git(*args, cwd: str = None, timeout: int = 60) -> tuple[int, str]:
|
|
"""Run a git command async."""
|
|
proc = await asyncio.create_subprocess_exec(
|
|
"git", *args,
|
|
cwd=cwd or str(config.MAIN_WORKTREE),
|
|
stdout=asyncio.subprocess.PIPE,
|
|
stderr=asyncio.subprocess.PIPE,
|
|
)
|
|
try:
|
|
stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
|
|
except asyncio.TimeoutError:
|
|
proc.kill()
|
|
await proc.wait()
|
|
return -1, f"git {args[0]} timed out after {timeout}s"
|
|
output = (stdout or b"").decode().strip()
|
|
if stderr:
|
|
output += "\n" + stderr.decode().strip()
|
|
return proc.returncode, output
|
|
|
|
|
|
def _apply_timeline_entry(entity_path: str, timeline_entry: str) -> tuple[bool, str]:
    """Append a timeline entry to an existing entity file.

    Returns (success, message).
    """
    if not os.path.exists(entity_path):
        return False, f"entity file not found: {entity_path}"

    content = _read_file(entity_path)
    if not content:
        return False, f"entity file empty: {entity_path}"

    # Idempotency guard: an entry already present is reported, not re-added.
    if timeline_entry.strip() in content:
        return False, "duplicate timeline entry"

    if "## Timeline" not in content:
        # No Timeline section yet — start one at the end of the file.
        updated = content.rstrip() + "\n\n## Timeline\n\n" + timeline_entry + "\n"
    else:
        lines = content.split("\n")
        insert_at = len(lines)  # default: append after everything
        inside_timeline = False
        for idx, text in enumerate(lines):
            stripped = text.strip()
            if stripped.startswith("## Timeline"):
                inside_timeline = True
            elif inside_timeline and stripped.startswith("## "):
                # Slot the entry just before the next section heading.
                insert_at = idx
                break
        lines.insert(insert_at, timeline_entry)
        updated = "\n".join(lines)

    with open(entity_path, "w") as handle:
        handle.write(updated)

    return True, "timeline entry appended"
|
|
|
|
|
|
def _apply_claim_enrichment(claim_path: str, evidence: str, pr_number: int,
                            original_title: str, similarity: float) -> tuple[bool, str]:
    """Append auto-enrichment evidence to an existing claim file.

    Used for near-duplicate auto-conversion. (Ganymede: route through entity_batch)
    Returns (success, message).
    """
    if not os.path.exists(claim_path):
        return False, f"target claim not found: {claim_path}"

    content = _read_file(claim_path)
    if not content:
        return False, f"target claim empty: {claim_path}"

    # Idempotency: each PR may enrich a given claim at most once.
    if f"PR #{pr_number}" in content:
        return False, f"already enriched by PR #{pr_number}"

    enrichment_block = (
        f"\n\n### Auto-enrichment (near-duplicate conversion, similarity={similarity:.2f})\n"
        f"*Source: PR #{pr_number} — \"{original_title}\"*\n"
        f"*Auto-converted by substantive fixer. Review: revert if this evidence doesn't belong here.*\n\n"
        f"{evidence}\n"
    )

    separator = "\n---\n"
    if separator in content:
        # Keep whatever follows the final horizontal rule as the footer.
        head, tail = content.rsplit(separator, 1)
        updated = head + enrichment_block + separator + tail
    else:
        updated = content + enrichment_block

    with open(claim_path, "w") as handle:
        handle.write(updated)

    return True, "enrichment appended"
|
|
|
|
|
|
def _apply_entity_create(entity_path: str, content: str) -> tuple[bool, str]:
|
|
"""Create a new entity file. Returns (success, message)."""
|
|
if os.path.exists(entity_path):
|
|
return False, f"entity already exists: {entity_path}"
|
|
|
|
os.makedirs(os.path.dirname(entity_path), exist_ok=True)
|
|
with open(entity_path, "w") as f:
|
|
f.write(content)
|
|
|
|
return True, "entity created"
|
|
|
|
|
|
async def apply_batch(conn=None, max_entries: int = 50) -> tuple[int, int]:
    """Process the entity queue. Returns (applied, failed).

    1. Pull latest main
    2. Read pending queue entries
    3. Apply each operation to the main worktree
    4. Commit all changes in one batch commit
    5. Push to origin

    Entries are marked processed only AFTER a successful push; on any
    commit/push failure the worktree is hard-reset and the function
    returns applied=0, leaving the dequeued entries unmarked — presumably
    re-delivered by dequeue() on the next cycle (verify entity_queue
    semantics). The per-file duplicate checks in the _apply_* helpers make
    reapplication idempotent.

    Args:
        conn: optional DB connection; used only for db.audit — assumed to
            match whatever db.audit expects (TODO confirm against db module).
        max_entries: cap on queue entries dequeued per cycle.
    """
    main_wt = str(config.MAIN_WORKTREE)

    # Ensure we're on main branch — batch script may have left worktree on an extract branch
    await _git("checkout", "main", cwd=main_wt)

    # Pull latest main. Fetch + hard reset (rather than pull) discards any
    # stray local state so the batch always starts from origin/main.
    rc, out = await _git("fetch", "origin", "main", cwd=main_wt)
    if rc != 0:
        logger.error("Failed to fetch main: %s", out)
        return 0, 0
    rc, out = await _git("reset", "--hard", "origin/main", cwd=main_wt)
    if rc != 0:
        logger.error("Failed to reset main: %s", out)
        return 0, 0

    # Read queue
    entries = dequeue(limit=max_entries)
    if not entries:
        return 0, 0

    logger.info("Processing %d entity queue entries", len(entries))

    applied_entries: list[dict] = []  # Track for post-push marking (Ganymede review)
    failed = 0
    files_changed: set[str] = set()  # repo-relative paths to stage

    for entry in entries:
        # Handle enrichments (from substantive fixer near-duplicate conversion)
        if entry.get("type") == "enrichment":
            target = entry.get("target_claim", "")
            evidence = entry.get("evidence", "")
            domain = entry.get("domain", "")
            if not target or not evidence:
                mark_failed(entry, "enrichment missing target or evidence")
                failed += 1
                continue
            # basename() strips any directory component from the queued
            # target — path-traversal guard, same as for entity filenames.
            claim_path = os.path.join(main_wt, "domains", domain, os.path.basename(target))
            rel_path = os.path.join("domains", domain, os.path.basename(target))
            try:
                ok, msg = _apply_claim_enrichment(
                    claim_path, evidence, entry.get("pr_number", 0),
                    entry.get("original_title", ""), entry.get("similarity", 0),
                )
                if ok:
                    files_changed.add(rel_path)
                    applied_entries.append(entry)
                    logger.info("Applied enrichment to %s: %s", target, msg)
                else:
                    mark_failed(entry, msg)
                    failed += 1
            except Exception as e:
                logger.exception("Failed enrichment on %s", target)
                mark_failed(entry, str(e))
                failed += 1
            continue

        # Handle entity operations
        entity = entry.get("entity", {})
        filename = entity.get("filename", "")
        domain = entity.get("domain", "")
        action = entity.get("action", "")

        if not filename or not domain:
            mark_failed(entry, "missing filename or domain")
            failed += 1
            continue

        # Sanitize filename — prevent path traversal (Ganymede review)
        filename = os.path.basename(filename)

        entity_dir = os.path.join(main_wt, "entities", domain)
        entity_path = os.path.join(entity_dir, filename)
        rel_path = os.path.join("entities", domain, filename)

        try:
            if action == "update":
                timeline = entity.get("timeline_entry", "")
                if not timeline:
                    mark_failed(entry, "update with no timeline_entry")
                    failed += 1
                    continue

                ok, msg = _apply_timeline_entry(entity_path, timeline)
                if ok:
                    files_changed.add(rel_path)
                    applied_entries.append(entry)
                    logger.debug("Applied update to %s: %s", filename, msg)
                else:
                    mark_failed(entry, msg)
                    failed += 1

            elif action == "create":
                content = entity.get("content", "")
                if not content:
                    mark_failed(entry, "create with no content")
                    failed += 1
                    continue

                # If entity already exists, try to apply as timeline update instead
                if os.path.exists(entity_path):
                    timeline = entity.get("timeline_entry", "")
                    if timeline:
                        ok, msg = _apply_timeline_entry(entity_path, timeline)
                        if ok:
                            files_changed.add(rel_path)
                            applied_entries.append(entry)
                        else:
                            mark_failed(entry, f"create→update fallback: {msg}")
                            failed += 1
                    else:
                        mark_failed(entry, "entity exists, no timeline to append")
                        failed += 1
                    continue

                ok, msg = _apply_entity_create(entity_path, content)
                if ok:
                    files_changed.add(rel_path)
                    applied_entries.append(entry)
                    logger.debug("Created entity %s", filename)
                else:
                    mark_failed(entry, msg)
                    failed += 1

            else:
                mark_failed(entry, f"unknown action: {action}")
                failed += 1

        except Exception as e:
            logger.exception("Failed to apply entity %s", filename)
            mark_failed(entry, str(e))
            failed += 1

    applied = len(applied_entries)

    # Commit and push if any files changed
    if files_changed:
        # Stage changed files
        for f in files_changed:
            await _git("add", f, cwd=main_wt)

        # Commit
        commit_msg = (
            f"entity-batch: update {len(files_changed)} entities\n\n"
            f"- Applied {applied} entity operations from queue\n"
            f"- Files: {', '.join(sorted(files_changed)[:10])}"
            f"{'...' if len(files_changed) > 10 else ''}\n\n"
            f"Pentagon-Agent: Epimetheus <968B2991-E2DF-4006-B962-F5B0A0CC8ACA>"
        )
        rc, out = await _git("commit", "-m", commit_msg, cwd=main_wt)
        if rc != 0:
            logger.error("Entity batch commit failed: %s", out)
            # Entries were neither pushed nor marked processed; report the
            # counts applied locally and let the next cycle retry.
            return applied, failed

        # Push with retry — main advances frequently from merge module.
        # Pull-rebase before each attempt to catch up with remote.
        push_ok = False
        for attempt in range(3):
            # Always pull-rebase before pushing to catch up with remote main
            rc, out = await _git("pull", "--rebase", "origin", "main", cwd=main_wt, timeout=30)
            if rc != 0:
                logger.warning("Entity batch pull-rebase failed (attempt %d): %s", attempt + 1, out)
                # Abort any half-done rebase and discard local work so the
                # worktree returns to a clean origin/main state.
                await _git("rebase", "--abort", cwd=main_wt)
                await _git("reset", "--hard", "origin/main", cwd=main_wt)
                # Local changes were thrown away: report 0 applied and count
                # every dequeued entry as failed for this cycle.
                return 0, failed + applied

            rc, out = await _git("push", "origin", "main", cwd=main_wt, timeout=30)
            if rc == 0:
                push_ok = True
                break
            logger.warning("Entity batch push failed (attempt %d), retrying: %s", attempt + 1, out[:100])
            await asyncio.sleep(2)  # Brief pause before retry

        if not push_ok:
            logger.error("Entity batch push failed after 3 attempts")
            await _git("reset", "--hard", "origin/main", cwd=main_wt)
            return 0, failed + applied

        # Push succeeded — NOW mark entries as processed (Ganymede review)
        for entry in applied_entries:
            mark_processed(entry)

        logger.info(
            "Entity batch: committed %d file changes (%d applied, %d failed)",
            len(files_changed), applied, failed,
        )

        # Audit
        if conn:
            db.audit(
                conn, "entity_batch", "batch_applied",
                json.dumps({
                    "applied": applied, "failed": failed,
                    "files": sorted(files_changed)[:20],
                }),
            )

    # Cleanup old entries
    cleanup(max_age_hours=24)

    return applied, failed
|
|
|
|
|
|
async def entity_batch_cycle(conn, max_workers=None) -> tuple[int, int]:
    """Pipeline stage entry point. Called by teleo-pipeline.py's ingest stage.

    Args:
        conn: DB connection forwarded to apply_batch for audit logging.
        max_workers: accepted for stage-signature compatibility but unused —
            apply_batch processes queue entries sequentially.

    Returns:
        (applied, failed) counts from apply_batch.
    """
    return await apply_batch(conn)
|